--- a/app/app.yaml.template Mon May 25 23:42:15 2009 +0200
+++ b/app/app.yaml.template Tue May 26 02:37:39 2009 +0200
@@ -46,6 +46,14 @@
- url: /json
static_dir: json
+- url: /admin/shell.*
+ script: shell/shell.py
+ login: admin
+
+- url: /static
+ static_dir: shell/static
+ expiration: 1d
+
- url: /.*
script: main.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/app/gae_django.py Tue May 26 02:37:39 2009 +0200
@@ -0,0 +1,61 @@
+#!/usr/bin/python2.5
+#
+# Copyright 2008 the Melange authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Module containing Melange Django 1.0+ configuration for Google App Engine.
+"""
+
+import logging
+import os
+import sys
+
+__authors__ = [
+ # alphabetical order by last name, please
+ '"Pawel Solyga" <pawel.solyga@gmail.com>',
+ ]
+
+
+# Remove the standard version of Django.
+for k in [k for k in sys.modules if k.startswith('django')]:
+ del sys.modules[k]
+
+# Force sys.path to have our own directory first, in case we want to import
+# from it. This lets us replace the built-in Django
+sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)))
+
+sys.path.insert(0, os.path.abspath('django.zip'))
+
+# Force Django to reload its settings.
+from django.conf import settings
+settings._target = None
+
+# Must set this env var before importing any part of Django
+os.environ['DJANGO_SETTINGS_MODULE'] = 'settings'
+
+import django.core.signals
+import django.db
+
+# Log errors.
+def log_exception(*args, **kwds):
+ """Function used for logging exceptions.
+ """
+ logging.exception('Exception in request:')
+
+# Log all exceptions detected by Django.
+django.core.signals.got_request_exception.connect(log_exception)
+
+# Unregister the rollback event handler.
+django.core.signals.got_request_exception.disconnect(
+ django.db._rollback_on_exception)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/app/ghop/models/comment.py Tue May 26 02:37:39 2009 +0200
@@ -0,0 +1,40 @@
+#!/usr/bin/python2.5
+#
+# Copyright 2009 the Melange authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""This module contains the GHOP specific Comment Model.
+"""
+
+__authors__ = [
+ '"Madhusudan.C.S" <madhusudancs@gmail.com>',
+]
+
+
+from google.appengine.ext import db
+
+from django.utils.translation import ugettext
+
+import soc.models.comment
+
+
+class GHOPComment(soc.models.comment.Comment):
+ """GHOP Comment model for tasks, extends the basic Comment model.
+ """
+
+ #: Property containing the human readable string that should be
+ #: shown for the comment when something in the task changes,
+ #: code.google.com issue tracker style
+ change_in_task = db.StringProperty(required=True,
+ verbose_name=ugettext('Changes in the task'))
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/app/ghop/models/org_prize_assignment.py Tue May 26 02:37:39 2009 +0200
@@ -0,0 +1,54 @@
+#!/usr/bin/python2.5
+#
+# Copyright 2009 the Melange authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""This module contains the GHOP PrizePerOrg Model.
+"""
+
+__authors__ = [
+ '"Madhusudan.C.S" <madhusudancs@gmail.com>',
+]
+
+
+from google.appengine.ext import db
+
+import soc.models.base
+
+import ghop.models.organization
+import ghop.models.program
+
+
+class GHOPOrgPrizeAssignment(soc.models.base.ModelWithFieldAttributes):
+ """Model for prizes assigned to Students by an Organization.
+ """
+
+ #: Program to which these winners belong to
+ program = db.ReferenceProperty(reference_class=ghop.models.program.GHOPProgram,
+ required=True,
+ collection_name='program_prizes')
+
+ #: Organization to which these winners belong to
+ org = db.ReferenceProperty(
+ reference_class=ghop.models.organization.GHOPOrganization,
+ required=True, collection_name='organization_prizes')
+
+ #: Ordered list of winners(reference to Student entities) for the given
+ #: organization under the specified program
+ winners = db.ListProperty(item_type=db.Key, default=[])
+
+ #: unordered list of runner-ups(reference to Student entities) for the given
+ #: organization under the specified program
+ runner_ups = db.ListProperty(item_type=db.Key, default=[])
+
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/app/ghop/models/organization.py Tue May 26 02:37:39 2009 +0200
@@ -0,0 +1,35 @@
+#!/usr/bin/python2.5
+#
+# Copyright 2009 the Melange authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""This module contains the GHOP specific Organization Model.
+"""
+
+__authors__ = [
+ '"Madhusudan.C.S" <madhusudancs@gmail.com>',
+]
+
+
+from google.appengine.ext import db
+
+import soc.models.organization
+
+
+class GHOPOrganization(soc.models.organization.Organization):
+ """GHOP Organization model extends the basic Organization model.
+ """
+
+ #: Property that stores the amount of tasks the organization can publish.
+ task_quota_limit = db.IntegerProperty(required=False, default=0)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/app/ghop/models/program.py Tue May 26 02:37:39 2009 +0200
@@ -0,0 +1,76 @@
+#!/usr/bin/python2.5
+#
+# Copyright 2009 the Melange authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""This module contains the GHOP specific Program Model.
+"""
+
+__authors__ = [
+ '"Madhusudan.C.S" <madhusudancs@gmail.com>',
+]
+
+
+from google.appengine.ext import db
+
+from django.utils.translation import ugettext
+
+import soc.models.program
+
+
+class GHOPProgram(soc.models.program.Program):
+  """GHOP Program model extends the basic Program model.
+  """
+
+  #: Property that contains the latest date of birth before which a Student
+  #: can participate
+  student_min_age = db.DateTimeProperty(required=False)
+  student_min_age.help_text = ugettext(
+      'Minimum age of the student to sign-up. Given by the latest birthdate allowed')
+
+  #: Required property containing the number of Tasks Students can work
+  #: on simultaneously. For GHOP it is 1
+  nr_simultaneous_tasks = db.IntegerProperty(
+      required=True, default=1,
+      verbose_name=ugettext('Simultaneous tasks'))
+  nr_simultaneous_tasks.help_text = ugettext(
+      'Number of tasks students can work on simultaneously in the program.')
+
+  #: Property containing the number of winners per Organization
+  nr_winners = db.IntegerProperty(
+      required=True, default=0,
+      verbose_name=ugettext('Winners per organization'))
+  nr_winners.help_text = ugettext(
+      'Number of winners an organization can announce.')
+
+  #: Property containing the number of runner ups per Organization
+  nr_runnerups = db.IntegerProperty(
+      required=True, default=0,
+      verbose_name=ugettext('Runner-ups per organization'))
+  nr_runnerups.help_text = ugettext(
+      'Number of runner-ups an organization can announce.')
+
+  #: A list of difficulty levels that can be assigned for each Task created
+  task_difficulties = db.StringListProperty(
+      required=True, default=[''],
+      verbose_name=ugettext('Difficulty levels'))
+  task_difficulties.help_text = ugettext(
+      'List all the difficulty levels that can be assigned to a task.')
+
+  #: A list of task types that a Task can belong to
+  task_types = db.StringListProperty(
+      required=True, default=['Any'],
+      verbose_name=ugettext('Task Types'))
+  task_types.help_text = ugettext(
+      'List all the types a task can be in.')
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/app/ghop/models/task.py Tue May 26 02:37:39 2009 +0200
@@ -0,0 +1,181 @@
+#!/usr/bin/python2.5
+#
+# Copyright 2009 the Melange authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""This module contains the GHOP Task Model.
+"""
+
+__authors__ = [
+ '"Madhusudan.C.S" <madhusudancs@gmail.com>',
+ '"Lennard de Rijk" <ljvderijk@gmail.com>',
+]
+
+
+from google.appengine.ext import db
+
+from django.utils.translation import ugettext
+
+import soc.models.linkable
+import soc.models.role
+import soc.models.student
+import soc.models.user
+
+import ghop.models.program
+
+
+class GHOPTask(soc.models.linkable.Linkable):
+ """Model for a task used in GHOP workflow.
+
+ The scope property of Linkable will be set to the Organization to which
+ this task belongs to. A link_id will be generated automatically and will
+ have no specific meaning other than identification.
+ """
+
+ #: Required field indicating the "title" of the task
+ title = db.StringProperty(required=True,
+ verbose_name=ugettext('Title'))
+ title.help_text = ugettext('Title of the task')
+
+ #: Required field containing the description of the task
+ description = db.TextProperty(required=True,
+ verbose_name=ugettext('Description'))
+ description.help_text = ugettext('Complete description of the task')
+
+ #: Field indicating the difficulty level of the Task. This is not
+  #: mandatory so that it can be assigned at any later stage.
+ #: The options are configured by a Program Admin.
+ difficulty = db.StringProperty(required=False,
+ verbose_name=ugettext('Difficulty'))
+ difficulty.help_text = ugettext('Difficulty Level of the task')
+
+ #: Required field which contains the type of the task. These types are
+ #: configured by a Program Admin.
+ type = db.StringListProperty(required=True,
+ verbose_name=ugettext('Task Type'))
+ type.help_text = ugettext('Type of the task')
+
+ #: A field which contains time allowed for completing the task (in hours)
+ #: from the moment that this task has been assigned to a Student
+ time_to_complete = db.IntegerProperty(required=True,
+ verbose_name=('Time to Complete'))
+ time_to_complete.help_text = ugettext(
+ 'Time allowed to complete the task, in hours, once it is claimed')
+
+ #: List of Mentors assigned to this task. A Mentor who creates this
+ #: task is assigned as the Mentor by default. An Org Admin will have
+ #: to assign a Mentor upon task creation.
+ mentors = db.ListProperty(item_type=db.Key, default=[])
+
+ #: User profile to whom this task has been claimed by. This field
+ #: is mandatory for claimed tasks
+ user = db.ReferenceProperty(reference_class=soc.models.user.User,
+ required=False,
+ collection_name='assigned_tasks')
+
+ #: Student profile to whom this task is currently assigned to. If the user
+  #: has registered as a Student then this field will be filled in. This field
+ #: is mandatory for all Tasks in the closed state.
+ student = db.ReferenceProperty(reference_class=soc.models.student.Student,
+ required=False,
+ collection_name='assigned_tasks')
+
+ #: Program in which this Task has been created
+ program = db.ReferenceProperty(reference_class=ghop.models.program.GHOPProgram,
+ required=True,
+ collection_name='tasks')
+
+ #: Required property which holds the state, the Task is currently in.
+ #: This is a hidden field not shown on forms. Handled by logic internally.
+ #: The state can be one of the following:
+ #: unapproved: If Task is created by a Mentor, this is the automatically
+ #: assigned state.
+ #: unpublished: This Task is not published yet.
+ #: open: This Task is open and ready to be claimed.
+ #: reopened: This Task has been claimed but never finished and has been
+ #: reopened.
+ #: claim_requested: A Student has requested to claim this task.
+ #: claimed: This Task has been claimed and someone is working on it.
+ #: action_needed: Work on this Task must be submitted for review within
+ #: 24 hours.
+ #: closed: Work on this Task has been completed to the org's content.
+ #: awaiting_registration: Student has completed work on this task, but
+ #: needs to complete Student registration before this task is closed.
+  #: needs_work: The work on this Task needs a bit more brushing up. This
+ #: state is followed by a Mentor review.
+ #: needs_review: Student has submitted work for this task and it should
+ #: be reviewed by a Mentor.
+ status = db.StringProperty(
+ required=True, verbose_name=ugettext('Status'),
+ choices=['unapproved', 'unpublished', 'open', 'reopened',
+ 'claim_requested', 'claimed', 'action_needed',
+ 'closed', 'awaiting_registration', 'needs_work',
+ 'needs_review'],
+ default='unapproved')
+
+ #: A field which indicates if the Task was ever in the Reopened state.
+ #: True indicates that its state was Reopened once, false indicated that it
+ #: has never been in the Reopened state.
+ was_reopened = db.BooleanProperty(default=False,
+ verbose_name=ugettext('Has been reopened'))
+
+ #: This field is set to the next deadline that will have consequences for
+ #: this Task. For instance this will store a DateTime property which will
+ #: tell when this Task should be completed.
+ deadline = db.DateTimeProperty(required=False,
+ verbose_name=ugettext('Deadline'))
+
+ #: Required field containing the Mentor/Org Admin who created this task
+ created_by = db.ReferenceProperty(reference_class=soc.models.role.Role,
+ required=True,
+ collection_name='created_tasks',
+ verbose_name=ugettext('Created by'))
+
+ #: Date when the proposal was created
+ created_on = db.DateTimeProperty(required=True, auto_now_add=True,
+ verbose_name=ugettext('Created on'))
+
+ #: Required field containing the Mentor/Org Admin who last edited this
+ #: task. It changes only when Mentor/Org Admin changes title, description,
+ #: difficulty, type, time_to_complete.
+ modified_by = db.ReferenceProperty(reference_class=soc.models.role.Role,
+ required=True,
+ collection_name='edited_tasks',
+ verbose_name=ugettext('Modified by'))
+
+ #: Date when the proposal was last modified, should be set manually on edit
+ modified_on = db.DateTimeProperty(required=True, auto_now_add=True,
+ verbose_name=ugettext('Modified on'))
+
+ #: A field which holds the entire history of this task in JSON. The
+ #: structure of this JSON string is as follows:
+ #: {
+ #: timestamp1: {
+ #: "user": User reference
+ #: "student": Student reference
+ #: ...
+ #: "state": "Unapproved"
+ #: ...
+ #: "edited_by": Role reference
+ #:
+ #: }
+ #: timestamp2: {
+ #: "state": "Unpublished"
+ #: }
+ #: }
+ #: First dictionary item holds the values for all the properties in this
+ #: model. The subsequent items hold the properties that changed at the
+ #: timestamp given by the key.
+ #: Reference properties will be stored by calling str() on their Key.
+ history = db.TextProperty(required=True, default='')
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/app/ghop/models/timeline.py Tue May 26 02:37:39 2009 +0200
@@ -0,0 +1,62 @@
+#!/usr/bin/python2.5
+#
+# Copyright 2009 the Melange authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""This module contains the GHOP specific Timeline Model.
+"""
+
+__authors__ = [
+ '"Madhusudan.C.S" <madhusudancs@gmail.com>',
+]
+
+
+from google.appengine.ext import db
+
+from django.utils.translation import ugettext
+
+import soc.models.timeline
+
+
+class GHOPTimeline(soc.models.timeline.Timeline):
+  """GHOP Timeline model extends the basic Timeline model. It implements
+  the GHOP specific timeline entries.
+  """
+
+  task_claim_deadline = db.DateTimeProperty(
+      verbose_name=ugettext('Task Claim Deadline date'))
+  task_claim_deadline.help_text = ugettext(
+      'No tasks can be claimed after this date. '
+      'Work on claimed tasks can continue.')
+
+  stop_all_work = db.DateTimeProperty(
+      verbose_name=ugettext('Work Submission Deadline date'))
+  stop_all_work.help_text = ugettext(
+      'All work must stop by this date.')
+
+  winner_selection_start = db.DateTimeProperty(
+      verbose_name=ugettext('Winner Selection Start date'))
+  winner_selection_start.help_text = ugettext(
+      'Organizations start choosing their winners.')
+
+  winner_selection_end = db.DateTimeProperty(
+      verbose_name=ugettext('Winner Selection End date'))
+  winner_selection_end.help_text = ugettext(
+      'Organizations must have completed choosing their winners.')
+
+  winner_announcement = db.DateTimeProperty(
+      verbose_name=ugettext('Winner Announcement date'))
+  winner_announcement.help_text = ugettext(
+      'All winners are announced.')
+
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/app/ghop/models/work_submission.py Tue May 26 02:37:39 2009 +0200
@@ -0,0 +1,72 @@
+#!/usr/bin/python2.5
+#
+# Copyright 2009 the Melange authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""This module contains the GHOP WorkSubmission Model.
+"""
+
+__authors__ = [
+ '"Madhusudan.C.S" <madhusudancs@gmail.com>',
+ '"Lennard de Rijk" <ljvderijk@gmail.com>',
+]
+
+
+from google.appengine.ext import db
+
+from django.utils.translation import ugettext
+
+import soc.models.linkable
+import soc.models.user
+
+import ghop.models.program
+import ghop.models.task
+
+
+class GHOPWorkSubmission(soc.models.linkable.Linkable):
+ """Model for work submissions for a task by students.
+
+ Scope will be set to the Organization to which this work has been submitted.
+ """
+
+ #: Task to which this work was submitted
+ task = db.ReferenceProperty(reference_class=ghop.models.task.GHOPTask,
+ required=True,
+ collection_name='work_submissions')
+
+ #: User who submitted this work
+ user = db.ReferenceProperty(reference_class=soc.models.user.User,
+ required=True,
+ collection_name='work_submissions')
+
+ #: Program to which this work belongs to
+ program = db.ReferenceProperty(reference_class=ghop.models.program.GHOPProgram,
+ required=True,
+ collection_name='work_submissions')
+
+ #: Property allowing you to store information about your work
+ information = db.TextProperty(
+ required=True, verbose_name=ugettext('Info'))
+ information.help_text = ugettext(
+ 'Information about the work you submit for this task')
+
+ #: Property containing an URL to this work or more information about it
+ url_to_work = db.LinkProperty(
+ required=False, verbose_name=ugettext('URL to your Work'))
+ url_to_work.help_text = ugettext(
+ 'URL to a resource containing your work or more information about it')
+
+ #: Property containing the date when the work was submitted
+ submitted_on = db.DateTimeProperty(required=True, auto_now_add=True,
+ verbose_name=ugettext('Submitted on'))
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/app/htmlsanitizer/BeautifulSoup.py Tue May 26 02:37:39 2009 +0200
@@ -0,0 +1,2000 @@
+"""Beautiful Soup
+Elixir and Tonic
+"The Screen-Scraper's Friend"
+http://www.crummy.com/software/BeautifulSoup/
+
+Beautiful Soup parses a (possibly invalid) XML or HTML document into a
+tree representation. It provides methods and Pythonic idioms that make
+it easy to navigate, search, and modify the tree.
+
+A well-formed XML/HTML document yields a well-formed data
+structure. An ill-formed XML/HTML document yields a correspondingly
+ill-formed data structure. If your document is only locally
+well-formed, you can use this library to find and process the
+well-formed part of it.
+
+Beautiful Soup works with Python 2.2 and up. It has no external
+dependencies, but you'll have more success at converting data to UTF-8
+if you also install these three packages:
+
+* chardet, for auto-detecting character encodings
+ http://chardet.feedparser.org/
+* cjkcodecs and iconv_codec, which add more encodings to the ones supported
+ by stock Python.
+ http://cjkpython.i18n.org/
+
+Beautiful Soup defines classes for two main parsing strategies:
+
+ * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
+ language that kind of looks like XML.
+
+ * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
+ or invalid. This class has web browser-like heuristics for
+ obtaining a sensible parse tree in the face of common HTML errors.
+
+Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
+the encoding of an HTML or XML document, and converting it to
+Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
+
+For more than you ever wanted to know about Beautiful Soup, see the
+documentation:
+http://www.crummy.com/software/BeautifulSoup/documentation.html
+
+Here, have some legalese:
+
+Copyright (c) 2004-2009, Leonard Richardson
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following
+ disclaimer in the documentation and/or other materials provided
+ with the distribution.
+
+ * Neither the name of the the Beautiful Soup Consortium and All
+ Night Kosher Bakery nor the names of its contributors may be
+ used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
+
+"""
+from __future__ import generators
+
+__author__ = "Leonard Richardson (leonardr@segfault.org)"
+__version__ = "3.1.0.1"
+__copyright__ = "Copyright (c) 2004-2009 Leonard Richardson"
+__license__ = "New-style BSD"
+
+import codecs
+import markupbase
+import types
+import re
+from HTMLParser import HTMLParser, HTMLParseError
+try:
+ from htmlentitydefs import name2codepoint
+except ImportError:
+ name2codepoint = {}
+try:
+ set
+except NameError:
+ from sets import Set as set
+
+#These hacks make Beautiful Soup able to parse XML with namespaces
+markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match
+
+DEFAULT_OUTPUT_ENCODING = "utf-8"
+
+# First, the classes that represent markup elements.
+
+def sob(unicode, encoding):
+ """Returns either the given Unicode string or its encoding."""
+ if encoding is None:
+ return unicode
+ else:
+ return unicode.encode(encoding)
+
+class PageElement:
+ """Contains the navigational information for some part of the page
+ (either a tag or a piece of text)"""
+
+ def setup(self, parent=None, previous=None):
+ """Sets up the initial relations between this element and
+ other elements."""
+ self.parent = parent
+ self.previous = previous
+ self.next = None
+ self.previousSibling = None
+ self.nextSibling = None
+ if self.parent and self.parent.contents:
+ self.previousSibling = self.parent.contents[-1]
+ self.previousSibling.nextSibling = self
+
+ def replaceWith(self, replaceWith):
+ oldParent = self.parent
+ myIndex = self.parent.contents.index(self)
+ if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent:
+ # We're replacing this element with one of its siblings.
+ index = self.parent.contents.index(replaceWith)
+ if index and index < myIndex:
+ # Furthermore, it comes before this element. That
+ # means that when we extract it, the index of this
+ # element will change.
+ myIndex = myIndex - 1
+ self.extract()
+ oldParent.insert(myIndex, replaceWith)
+
+ def extract(self):
+ """Destructively rips this element out of the tree."""
+ if self.parent:
+ try:
+ self.parent.contents.remove(self)
+ except ValueError:
+ pass
+
+ #Find the two elements that would be next to each other if
+ #this element (and any children) hadn't been parsed. Connect
+ #the two.
+ lastChild = self._lastRecursiveChild()
+ nextElement = lastChild.next
+
+ if self.previous:
+ self.previous.next = nextElement
+ if nextElement:
+ nextElement.previous = self.previous
+ self.previous = None
+ lastChild.next = None
+
+ self.parent = None
+ if self.previousSibling:
+ self.previousSibling.nextSibling = self.nextSibling
+ if self.nextSibling:
+ self.nextSibling.previousSibling = self.previousSibling
+ self.previousSibling = self.nextSibling = None
+ return self
+
+ def _lastRecursiveChild(self):
+ "Finds the last element beneath this object to be parsed."
+ lastChild = self
+ while hasattr(lastChild, 'contents') and lastChild.contents:
+ lastChild = lastChild.contents[-1]
+ return lastChild
+
+ def insert(self, position, newChild):
+ if (isinstance(newChild, basestring)
+ or isinstance(newChild, unicode)) \
+ and not isinstance(newChild, NavigableString):
+ newChild = NavigableString(newChild)
+
+ position = min(position, len(self.contents))
+ if hasattr(newChild, 'parent') and newChild.parent != None:
+ # We're 'inserting' an element that's already one
+ # of this object's children.
+ if newChild.parent == self:
+ index = self.find(newChild)
+ if index and index < position:
+ # Furthermore we're moving it further down the
+ # list of this object's children. That means that
+ # when we extract this element, our target index
+ # will jump down one.
+ position = position - 1
+ newChild.extract()
+
+ newChild.parent = self
+ previousChild = None
+ if position == 0:
+ newChild.previousSibling = None
+ newChild.previous = self
+ else:
+ previousChild = self.contents[position-1]
+ newChild.previousSibling = previousChild
+ newChild.previousSibling.nextSibling = newChild
+ newChild.previous = previousChild._lastRecursiveChild()
+ if newChild.previous:
+ newChild.previous.next = newChild
+
+ newChildsLastElement = newChild._lastRecursiveChild()
+
+ if position >= len(self.contents):
+ newChild.nextSibling = None
+
+ parent = self
+ parentsNextSibling = None
+ while not parentsNextSibling:
+ parentsNextSibling = parent.nextSibling
+ parent = parent.parent
+ if not parent: # This is the last element in the document.
+ break
+ if parentsNextSibling:
+ newChildsLastElement.next = parentsNextSibling
+ else:
+ newChildsLastElement.next = None
+ else:
+ nextChild = self.contents[position]
+ newChild.nextSibling = nextChild
+ if newChild.nextSibling:
+ newChild.nextSibling.previousSibling = newChild
+ newChildsLastElement.next = nextChild
+
+ if newChildsLastElement.next:
+ newChildsLastElement.next.previous = newChildsLastElement
+ self.contents.insert(position, newChild)
+
+ def append(self, tag):
+ """Appends the given tag to the contents of this tag."""
+ self.insert(len(self.contents), tag)
+
+ def findNext(self, name=None, attrs={}, text=None, **kwargs):
+ """Returns the first item that matches the given criteria and
+ appears after this Tag in the document."""
+ return self._findOne(self.findAllNext, name, attrs, text, **kwargs)
+
+ def findAllNext(self, name=None, attrs={}, text=None, limit=None,
+ **kwargs):
+ """Returns all items that match the given criteria and appear
+ after this Tag in the document."""
+ return self._findAll(name, attrs, text, limit, self.nextGenerator,
+ **kwargs)
+
+ def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
+ """Returns the closest sibling to this Tag that matches the
+ given criteria and appears after this Tag in the document."""
+ return self._findOne(self.findNextSiblings, name, attrs, text,
+ **kwargs)
+
+ def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
+ **kwargs):
+ """Returns the siblings of this Tag that match the given
+ criteria and appear after this Tag in the document."""
+ return self._findAll(name, attrs, text, limit,
+ self.nextSiblingGenerator, **kwargs)
+ fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x
+
+ def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
+ """Returns the first item that matches the given criteria and
+ appears before this Tag in the document."""
+ return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)
+
+ def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
+ **kwargs):
+ """Returns all items that match the given criteria and appear
+ before this Tag in the document."""
+ return self._findAll(name, attrs, text, limit, self.previousGenerator,
+ **kwargs)
+ fetchPrevious = findAllPrevious # Compatibility with pre-3.x
+
+ def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
+ """Returns the closest sibling to this Tag that matches the
+ given criteria and appears before this Tag in the document."""
+ return self._findOne(self.findPreviousSiblings, name, attrs, text,
+ **kwargs)
+
+ def findPreviousSiblings(self, name=None, attrs={}, text=None,
+ limit=None, **kwargs):
+ """Returns the siblings of this Tag that match the given
+ criteria and appear before this Tag in the document."""
+ return self._findAll(name, attrs, text, limit,
+ self.previousSiblingGenerator, **kwargs)
+ fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
+
+ def findParent(self, name=None, attrs={}, **kwargs):
+ """Returns the closest parent of this Tag that matches the given
+ criteria."""
+ # NOTE: We can't use _findOne because findParents takes a different
+ # set of arguments.
+ r = None
+ l = self.findParents(name, attrs, 1)
+ if l:
+ r = l[0]
+ return r
+
+ def findParents(self, name=None, attrs={}, limit=None, **kwargs):
+ """Returns the parents of this Tag that match the given
+ criteria."""
+
+ return self._findAll(name, attrs, None, limit, self.parentGenerator,
+ **kwargs)
+ fetchParents = findParents # Compatibility with pre-3.x
+
+ #These methods do the real heavy lifting.
+
+ def _findOne(self, method, name, attrs, text, **kwargs):
+ r = None
+ l = method(name, attrs, text, 1, **kwargs)
+ if l:
+ r = l[0]
+ return r
+
+ def _findAll(self, name, attrs, text, limit, generator, **kwargs):
+ "Iterates over a generator looking for things that match."
+
+ if isinstance(name, SoupStrainer):
+ strainer = name
+ else:
+ # Build a SoupStrainer
+ strainer = SoupStrainer(name, attrs, text, **kwargs)
+ results = ResultSet(strainer)
+ g = generator()
+ while True:
+ try:
+ i = g.next()
+ except StopIteration:
+ break
+ if i:
+ found = strainer.search(i)
+ if found:
+ results.append(found)
+ if limit and len(results) >= limit:
+ break
+ return results
+
+ #These Generators can be used to navigate starting from both
+ #NavigableStrings and Tags.
+ def nextGenerator(self):
+ i = self
+ while i:
+ i = i.next
+ yield i
+
+ def nextSiblingGenerator(self):
+ i = self
+ while i:
+ i = i.nextSibling
+ yield i
+
+ def previousGenerator(self):
+ i = self
+ while i:
+ i = i.previous
+ yield i
+
+ def previousSiblingGenerator(self):
+ i = self
+ while i:
+ i = i.previousSibling
+ yield i
+
+ def parentGenerator(self):
+ i = self
+ while i:
+ i = i.parent
+ yield i
+
+ # Utility methods
+ def substituteEncoding(self, str, encoding=None):
+ encoding = encoding or "utf-8"
+ return str.replace("%SOUP-ENCODING%", encoding)
+
+ def toEncoding(self, s, encoding=None):
+ """Encodes an object to a string in some encoding, or to Unicode.
+ ."""
+ if isinstance(s, unicode):
+ if encoding:
+ s = s.encode(encoding)
+ elif isinstance(s, str):
+ if encoding:
+ s = s.encode(encoding)
+ else:
+ s = unicode(s)
+ else:
+ if encoding:
+ s = self.toEncoding(str(s), encoding)
+ else:
+ s = unicode(s)
+ return s
+
class NavigableString(unicode, PageElement):
    # A Unicode string that is also a node in the parse tree (it mixes
    # in PageElement's navigation and search machinery).

    def __new__(cls, value):
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't know
        how to handle non-ASCII characters.
        """
        if isinstance(value, unicode):
            return unicode.__new__(cls, value)
        return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)

    def __getnewargs__(self):
        # Pickling support: rebuild from the plain Unicode value.
        return (unicode(self),)

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper."""
        if attr == 'string':
            return self
        else:
            raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)

    def encode(self, encoding=DEFAULT_OUTPUT_ENCODING):
        # Render first (applying any subclass markup wrapper via
        # decode), then encode to the requested codec.
        return self.decode().encode(encoding)

    def decodeGivenEventualEncoding(self, eventualEncoding):
        # Plain strings render as themselves; subclasses override this
        # to add their markup wrappers.
        return self
+
class CData(NavigableString):
    """A CDATA section: renders with the <![CDATA[ ... ]]> wrapper."""

    def decodeGivenEventualEncoding(self, eventualEncoding):
        return u'<![CDATA[%s]]>' % (self,)
+
class ProcessingInstruction(NavigableString):
    """A processing instruction: renders with the <? ... ?> wrapper,
    substituting the eventual encoding into any %SOUP-ENCODING% slot."""

    def decodeGivenEventualEncoding(self, eventualEncoding):
        text = self
        if u'%SOUP-ENCODING%' in text:
            text = self.substituteEncoding(text, eventualEncoding)
        return u'<?%s?>' % (text,)
+
class Comment(NavigableString):
    """An HTML/XML comment: renders with the <!-- ... --> wrapper."""
    def decodeGivenEventualEncoding(self, eventualEncoding):
        return u'<!--%s-->' % (self,)
+
class Declaration(NavigableString):
    """An SGML declaration (e.g. DOCTYPE): renders as <! ... >."""
    def decodeGivenEventualEncoding(self, eventualEncoding):
        return u'<!%s>' % (self,)
+
class Tag(PageElement):

    """Represents a found HTML tag with its attributes and contents."""

    def _invert(h):
        "Cheap function to invert a hash."
        # Used only at class-definition time (note: no self argument).
        i = {}
        for k,v in h.items():
            i[v] = k
        return i

    # The five predefined XML entities and the characters they stand for.
    XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
                                      "quot" : '"',
                                      "amp" : "&",
                                      "lt" : "<",
                                      "gt" : ">" }

    # Reverse mapping, built once when the class body executes.
    XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
+
+ def _convertEntities(self, match):
+ """Used in a call to re.sub to replace HTML, XML, and numeric
+ entities with the appropriate Unicode characters. If HTML
+ entities are being converted, any unrecognized entities are
+ escaped."""
+ x = match.group(1)
+ if self.convertHTMLEntities and x in name2codepoint:
+ return unichr(name2codepoint[x])
+ elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
+ if self.convertXMLEntities:
+ return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
+ else:
+ return u'&%s;' % x
+ elif len(x) > 0 and x[0] == '#':
+ # Handle numeric entities
+ if len(x) > 1 and x[1] == 'x':
+ return unichr(int(x[2:], 16))
+ else:
+ return unichr(int(x[1:]))
+
+ elif self.escapeUnrecognizedEntities:
+ return u'&%s;' % x
+ else:
+ return u'&%s;' % x
+
    def __init__(self, parser, name, attrs=None, parent=None,
                 previous=None):
        "Basic constructor."

        # We don't actually store the parser object: that lets extracted
        # chunks be garbage-collected
        self.parserClass = parser.__class__
        self.isSelfClosing = parser.isSelfClosingTag(name)
        self.name = name
        if attrs == None:
            attrs = []
        # attrs is a list of (key, value) pairs, not a dict, so that
        # duplicate attributes and original order are preserved.
        self.attrs = attrs
        self.contents = []
        self.setup(parent, previous)
        self.hidden = False
        self.containsSubstitutions = False
        # Copy the parser's entity-conversion policy so this tag can
        # still render itself after the parser has been discarded.
        self.convertHTMLEntities = parser.convertHTMLEntities
        self.convertXMLEntities = parser.convertXMLEntities
        self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities

        def convert(kval):
            "Converts HTML, XML and numeric entities in the attribute value."
            k, val = kval
            if val is None:
                return kval
            return (k, re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
                              self._convertEntities, val))
        self.attrs = map(convert, self.attrs)
+
+ def get(self, key, default=None):
+ """Returns the value of the 'key' attribute for the tag, or
+ the value given for 'default' if it doesn't have that
+ attribute."""
+ return self._getAttrMap().get(key, default)
+
+ def has_key(self, key):
+ return self._getAttrMap().has_key(key)
+
+ def __getitem__(self, key):
+ """tag[key] returns the value of the 'key' attribute for the tag,
+ and throws an exception if it's not there."""
+ return self._getAttrMap()[key]
+
+ def __iter__(self):
+ "Iterating over a tag iterates over its contents."
+ return iter(self.contents)
+
+ def __len__(self):
+ "The length of a tag is the length of its list of contents."
+ return len(self.contents)
+
+ def __contains__(self, x):
+ return x in self.contents
+
+ def __nonzero__(self):
+ "A tag is non-None even if it has no contents."
+ return True
+
+ def __setitem__(self, key, value):
+ """Setting tag[key] sets the value of the 'key' attribute for the
+ tag."""
+ self._getAttrMap()
+ self.attrMap[key] = value
+ found = False
+ for i in range(0, len(self.attrs)):
+ if self.attrs[i][0] == key:
+ self.attrs[i] = (key, value)
+ found = True
+ if not found:
+ self.attrs.append((key, value))
+ self._getAttrMap()[key] = value
+
+ def __delitem__(self, key):
+ "Deleting tag[key] deletes all 'key' attributes for the tag."
+ for item in self.attrs:
+ if item[0] == key:
+ self.attrs.remove(item)
+ #We don't break because bad HTML can define the same
+ #attribute multiple times.
+ self._getAttrMap()
+ if self.attrMap.has_key(key):
+ del self.attrMap[key]
+
+ def __call__(self, *args, **kwargs):
+ """Calling a tag like a function is the same as calling its
+ findAll() method. Eg. tag('a') returns a list of all the A tags
+ found within this tag."""
+ return apply(self.findAll, args, kwargs)
+
    def __getattr__(self, tag):
        #print "Getattr %s.%s" % (self.__class__, tag)
        # soup.fooTag is shorthand for soup.find('foo'); a plain
        # soup.foo works too, except for double-underscore names, which
        # must raise so protocols like copy/pickle behave correctly.
        if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
            return self.find(tag[:-3])
        elif tag.find('__') != 0:
            return self.find(tag)
        raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag)
+
+ def __eq__(self, other):
+ """Returns true iff this tag has the same name, the same attributes,
+ and the same contents (recursively) as the given tag.
+
+ NOTE: right now this will return false if two tags have the
+ same attributes in a different order. Should this be fixed?"""
+ if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
+ return False
+ for i in range(0, len(self.contents)):
+ if self.contents[i] != other.contents[i]:
+ return False
+ return True
+
+ def __ne__(self, other):
+ """Returns true iff this tag is not identical to the other tag,
+ as defined in __eq__."""
+ return not self == other
+
    def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Renders this tag as a string."""
        return self.decode(eventualEncoding=encoding)

    # Matches angle brackets, plus ampersands that do not begin a
    # numeric or named entity reference (those need escaping too).
    BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
                                           + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
                                           + ")")

    def _sub_entity(self, x):
        """Used with a regular expression to substitute the
        appropriate XML entity for an XML special character."""
        return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
+
    def __unicode__(self):
        # Unicode rendering delegates to decode().
        return self.decode()

    def __str__(self):
        # Byte-string rendering delegates to encode().
        return self.encode()

    def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
               prettyPrint=False, indentLevel=0):
        # Render to Unicode (telling decode() the eventual encoding so
        # %SOUP-ENCODING% slots are filled in), then encode the result.
        return self.decode(prettyPrint, indentLevel, encoding).encode(encoding)
+
    def decode(self, prettyPrint=False, indentLevel=0,
               eventualEncoding=DEFAULT_OUTPUT_ENCODING):
        """Returns a string or Unicode representation of this tag and
        its contents. To get Unicode, pass None for encoding."""

        # First render the attribute list as 'key="value"' strings.
        attrs = []
        if self.attrs:
            for key, val in self.attrs:
                fmt = '%s="%s"'
                if isString(val):
                    if (self.containsSubstitutions
                        and eventualEncoding is not None
                        and '%SOUP-ENCODING%' in val):
                        val = self.substituteEncoding(val, eventualEncoding)

                    # The attribute value either:
                    #
                    # * Contains no embedded double quotes or single quotes.
                    #   No problem: we enclose it in double quotes.
                    # * Contains embedded single quotes. No problem:
                    #   double quotes work here too.
                    # * Contains embedded double quotes. No problem:
                    #   we enclose it in single quotes.
                    # * Embeds both single _and_ double quotes. This
                    #   can't happen naturally, but it can happen if
                    #   you modify an attribute value after parsing
                    #   the document. Now we have a bit of a
                    #   problem. We solve it by enclosing the
                    #   attribute in single quotes, and escaping any
                    #   embedded single quotes to XML entities.
                    if '"' in val:
                        fmt = "%s='%s'"
                        if "'" in val:
                            # TODO: replace with apos when
                            # appropriate.
                            # NOTE(review): "&squot;" is not a standard
                            # XML entity; "&apos;" is presumably what was
                            # intended -- left as-is to preserve output.
                            val = val.replace("'", "&squot;")

                    # Now we're okay w/r/t quotes. But the attribute
                    # value might also contain angle brackets, or
                    # ampersands that aren't part of entities. We need
                    # to escape those to XML entities too.
                    val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)
                if val is None:
                    # Handle boolean attributes.
                    decoded = key
                else:
                    decoded = fmt % (key, val)
                attrs.append(decoded)
        close = ''
        closeTag = ''
        if self.isSelfClosing:
            close = ' /'
        else:
            closeTag = '</%s>' % self.name

        indentTag, indentContents = 0, 0
        if prettyPrint:
            indentTag = indentLevel
            space = (' ' * (indentTag-1))
            indentContents = indentTag + 1
        contents = self.decodeContents(prettyPrint, indentContents,
                                       eventualEncoding)
        if self.hidden:
            # Hidden tags (e.g. the root [document] tag) render only
            # their contents, with no enclosing markup.
            s = contents
        else:
            s = []
            attributeString = ''
            if attrs:
                attributeString = ' ' + ' '.join(attrs)
            if prettyPrint:
                s.append(space)
            s.append('<%s%s%s>' % (self.name, attributeString, close))
            if prettyPrint:
                s.append("\n")
            s.append(contents)
            if prettyPrint and contents and contents[-1] != "\n":
                s.append("\n")
            if prettyPrint and closeTag:
                s.append(space)
            s.append(closeTag)
            if prettyPrint and closeTag and self.nextSibling:
                s.append("\n")
            s = ''.join(s)
        return s
+
+ def decompose(self):
+ """Recursively destroys the contents of this tree."""
+ contents = [i for i in self.contents]
+ for i in contents:
+ if isinstance(i, Tag):
+ i.decompose()
+ else:
+ i.extract()
+ self.extract()
+
+ def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
+ return self.encode(encoding, True)
+
+ def encodeContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
+ prettyPrint=False, indentLevel=0):
+ return self.decodeContents(prettyPrint, indentLevel).encode(encoding)
+
    def decodeContents(self, prettyPrint=False, indentLevel=0,
                       eventualEncoding=DEFAULT_OUTPUT_ENCODING):
        """Renders the contents of this tag as a string in the given
        encoding. If encoding is None, returns a Unicode string.."""
        s=[]
        for c in self:
            text = None
            if isinstance(c, NavigableString):
                text = c.decodeGivenEventualEncoding(eventualEncoding)
            elif isinstance(c, Tag):
                # Child tags render themselves recursively.
                s.append(c.decode(prettyPrint, indentLevel, eventualEncoding))
            if text and prettyPrint:
                text = text.strip()
            if text:
                # When pretty-printing, indent each text node and end it
                # with a newline.
                if prettyPrint:
                    s.append(" " * (indentLevel-1))
                s.append(text)
                if prettyPrint:
                    s.append("\n")
        return ''.join(s)
+
+ #Soup methods
+
+ def find(self, name=None, attrs={}, recursive=True, text=None,
+ **kwargs):
+ """Return only the first child of this Tag matching the given
+ criteria."""
+ r = None
+ l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
+ if l:
+ r = l[0]
+ return r
+ findChild = find
+
+ def findAll(self, name=None, attrs={}, recursive=True, text=None,
+ limit=None, **kwargs):
+ """Extracts a list of Tag objects that match the given
+ criteria. You can specify the name of the Tag and any
+ attributes you want the Tag to have.
+
+ The value of a key-value pair in the 'attrs' map can be a
+ string, a list of strings, a regular expression object, or a
+ callable that takes a string and returns whether or not the
+ string matches for some custom definition of 'matches'. The
+ same is true of the tag name."""
+ generator = self.recursiveChildGenerator
+ if not recursive:
+ generator = self.childGenerator
+ return self._findAll(name, attrs, text, limit, generator, **kwargs)
+ findChildren = findAll
+
+ # Pre-3.x compatibility methods. Will go away in 4.0.
+ first = find
+ fetch = findAll
+
+ def fetchText(self, text=None, recursive=True, limit=None):
+ return self.findAll(text=text, recursive=recursive, limit=limit)
+
+ def firstText(self, text=None, recursive=True):
+ return self.find(text=text, recursive=recursive)
+
+ # 3.x compatibility methods. Will go away in 4.0.
+ def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
+ prettyPrint=False, indentLevel=0):
+ if encoding is None:
+ return self.decodeContents(prettyPrint, indentLevel, encoding)
+ else:
+ return self.encodeContents(encoding, prettyPrint, indentLevel)
+
+
+ #Private methods
+
+ def _getAttrMap(self):
+ """Initializes a map representation of this tag's attributes,
+ if not already initialized."""
+ if not getattr(self, 'attrMap'):
+ self.attrMap = {}
+ for (key, value) in self.attrs:
+ self.attrMap[key] = value
+ return self.attrMap
+
+ #Generator methods
+ def recursiveChildGenerator(self):
+ if not len(self.contents):
+ raise StopIteration
+ stopNode = self._lastRecursiveChild().next
+ current = self.contents[0]
+ while current is not stopNode:
+ yield current
+ current = current.next
+
+ def childGenerator(self):
+ if not len(self.contents):
+ raise StopIteration
+ current = self.contents[0]
+ while current:
+ yield current
+ current = current.nextSibling
+ raise StopIteration
+
+# Next, a couple classes to represent queries and their results.
class SoupStrainer:
    """Encapsulates a number of ways of matching a markup element (tag or
    text)."""

    def __init__(self, name=None, attrs={}, text=None, **kwargs):
        self.name = name
        # A bare string for 'attrs' is shorthand for a CSS class match.
        if isString(attrs):
            kwargs['class'] = attrs
            attrs = None
        if kwargs:
            if attrs:
                # Copy before merging so the caller's dict (and the
                # shared {} default) is never mutated.
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs
        self.attrs = attrs
        self.text = text
+
+ def __str__(self):
+ if self.text:
+ return self.text
+ else:
+ return "%s|%s" % (self.name, self.attrs)
+
    def searchTag(self, markupName=None, markupAttrs={}):
        # Matches a tag (or a name/attrs pair) against this strainer's
        # name and attribute criteria. Returns the matched object, or
        # None on no match.
        found = None
        markup = None
        if isinstance(markupName, Tag):
            # Called with an actual Tag: the tag doubles as its own
            # attribute mapping (Tag supports get()).
            markup = markupName
            markupAttrs = markup
        # A callable name criterion receives (name, attrs) -- but only
        # when we were given raw name/attrs, not a Tag.
        callFunctionWithTagData = callable(self.name) \
                and not isinstance(markupName, Tag)

        if (not self.name) \
               or callFunctionWithTagData \
               or (markup and self._matches(markup, self.name)) \
               or (not markup and self._matches(markupName, self.name)):
            if callFunctionWithTagData:
                match = self.name(markupName, markupAttrs)
            else:
                match = True
                markupAttrMap = None
                for attr, matchAgainst in self.attrs.items():
                    if not markupAttrMap:
                        # Lazily build a dict view of the markup's
                        # attributes on the first criterion checked.
                        if hasattr(markupAttrs, 'get'):
                            markupAttrMap = markupAttrs
                        else:
                            markupAttrMap = {}
                            for k,v in markupAttrs:
                                markupAttrMap[k] = v
                    attrValue = markupAttrMap.get(attr)
                    if not self._matches(attrValue, matchAgainst):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markupName
        return found
+
    def search(self, markup):
        #print 'looking for %s in %s' % (self, markup)
        # Dispatches on the markup's type; returns the matched element
        # or None.
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if isList(markup) and not isinstance(markup, Tag):
            for element in markup:
                if isinstance(element, NavigableString) \
                       and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.text:
                found = self.searchTag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, NavigableString) or \
                 isString(markup):
            if self._matches(markup, self.text):
                found = markup
        else:
            raise Exception, "I don't know how to match against a %s" \
                  % markup.__class__
        return found
+
    def _matches(self, markup, matchAgainst):
        #print "Matching %s against %s" % (markup, matchAgainst)
        # Core criterion matcher: matchAgainst may be True, a callable,
        # a regexp object, a list, a dict, or a string.
        result = False
        # The type check distinguishes the literal True from the int 1.
        if matchAgainst == True and type(matchAgainst) == types.BooleanType:
            result = markup != None
        elif callable(matchAgainst):
            result = matchAgainst(markup)
        else:
            #Custom match methods take the tag as an argument, but all
            #other ways of matching match the tag name as a string.
            if isinstance(markup, Tag):
                markup = markup.name
            if markup is not None and not isString(markup):
                markup = unicode(markup)
            #Now we know that chunk is either a string, or None.
            if hasattr(matchAgainst, 'match'):
                # It's a regexp object.
                result = markup and matchAgainst.search(markup)
            elif (isList(matchAgainst)
                  and (markup is not None or not isString(matchAgainst))):
                result = markup in matchAgainst
            elif hasattr(matchAgainst, 'items'):
                # NOTE(review): markup is a string or None by this point,
                # so this has_key call looks like it would raise for dict
                # criteria -- presumably an unsupported path; confirm.
                result = markup.has_key(matchAgainst)
            elif matchAgainst and isString(markup):
                # Normalize the criterion to the markup's string type
                # before the equality fallback below.
                if isinstance(markup, unicode):
                    matchAgainst = unicode(matchAgainst)
                else:
                    matchAgainst = str(matchAgainst)

            if not result:
                result = matchAgainst == markup
        return result
+
class ResultSet(list):
    """A ResultSet is just a list that keeps track of the SoupStrainer
    that created it."""
    def __init__(self, source):
        # Bug fix: the original called list.__init__([]), which
        # initializes a throwaway empty list rather than this instance
        # (it only worked because no-arg list.__init__ is a no-op).
        list.__init__(self)
        # The SoupStrainer that produced these results.
        self.source = source
+
+# Now, some helper functions.
+
def isList(l):
    """Convenience method that works with all 2.x versions of Python
    to determine whether or not something is listlike."""
    # Evaluation order matters: the hasattr branch handles the common
    # cases first, so types.ListType/TupleType (removed in Python 3) is
    # only consulted for old-style iterables.
    return ((hasattr(l, '__iter__') and not isString(l))
            or (type(l) in (types.ListType, types.TupleType)))
+
def isString(s):
    """Convenience method that works with all 2.x versions of Python
    to determine whether or not something is stringlike."""
    try:
        # Interpreters with the unicode builtin: accept any basestring.
        stringy = isinstance(s, unicode) or isinstance(s, basestring)
    except NameError:
        # No unicode builtin: fall back to a plain str check.
        stringy = isinstance(s, str)
    return stringy
+
def buildTagMap(default, *args):
    """Turns a list of maps, lists, or scalars into a single map.
    Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
    NESTING_RESET_TAGS maps out of lists and partial maps."""
    built = {}
    for portion in args:
        if hasattr(portion, 'items'):
            # A map: merge its entries as-is.
            built.update(portion)
        elif isList(portion) and not isString(portion):
            # A list: every element maps to the default value.
            for key in portion:
                built[key] = default
        else:
            # A scalar: it alone maps to the default value.
            built[portion] = default
    return built
+
+# Now, the parser classes.
+
class HTMLParserBuilder(HTMLParser):
    # Adapter that forwards HTMLParser events to a BeautifulStoneSoup
    # object, which builds the parse tree.

    def __init__(self, soup):
        HTMLParser.__init__(self)
        # The soup object that receives start/end/data events.
        self.soup = soup

    # We inherit feed() and reset().
+
+ def handle_starttag(self, name, attrs):
+ if name == 'meta':
+ self.soup.extractCharsetFromMeta(attrs)
+ else:
+ self.soup.unknown_starttag(name, attrs)
+
+ def handle_endtag(self, name):
+ self.soup.unknown_endtag(name)
+
+ def handle_data(self, content):
+ self.soup.handle_data(content)
+
    def _toStringSubclass(self, text, subclass):
        """Adds a certain piece of text to the tree as a NavigableString
        subclass."""
        # Flush any pending plain text, push this text, then close it
        # out again wrapped in the requested subclass.
        self.soup.endData()
        self.handle_data(text)
        self.soup.endData(subclass)
+
+ def handle_pi(self, text):
+ """Handle a processing instruction as a ProcessingInstruction
+ object, possibly one with a %SOUP-ENCODING% slot into which an
+ encoding will be plugged later."""
+ if text[:3] == "xml":
+ text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
+ self._toStringSubclass(text, ProcessingInstruction)
+
+ def handle_comment(self, text):
+ "Handle comments as Comment objects."
+ self._toStringSubclass(text, Comment)
+
+ def handle_charref(self, ref):
+ "Handle character references as data."
+ if self.soup.convertEntities:
+ data = unichr(int(ref))
+ else:
+ data = '&#%s;' % ref
+ self.handle_data(data)
+
    def handle_entityref(self, ref):
        """Handle entity references as data, possibly converting known
        HTML and/or XML entity references to the corresponding Unicode
        characters."""
        data = None
        if self.soup.convertHTMLEntities:
            try:
                data = unichr(name2codepoint[ref])
            except KeyError:
                pass

        if not data and self.soup.convertXMLEntities:
            data = self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)

        if not data and self.soup.convertHTMLEntities and \
            not self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
            # TODO: We've got a problem here. We're told this is
            # an entity reference, but it's not an XML entity
            # reference or an HTML entity reference. Nonetheless,
            # the logical thing to do is to pass it through as an
            # unrecognized entity reference.
            #
            # Except: when the input is "&carol;" this function
            # will be called with input "carol". When the input is
            # "AT&T", this function will be called with input
            # "T". We have no way of knowing whether a semicolon
            # was present originally, so we don't know whether
            # this is an unknown entity or just a misplaced
            # ampersand.
            #
            # The more common case is a misplaced ampersand, so I
            # escape the ampersand and omit the trailing semicolon.
            data = "&%s" % ref
        if not data:
            # This case is different from the one above, because we
            # haven't already gone through a supposedly comprehensive
            # mapping of entities to Unicode characters. We might not
            # have gone through any mapping at all. So the chances are
            # very high that this is a real entity, and not a
            # misplaced ampersand.
            data = "&%s;" % ref
        self.handle_data(data)
+
+ def handle_decl(self, data):
+ "Handle DOCTYPEs and the like as Declaration objects."
+ self._toStringSubclass(data, Declaration)
+
    def parse_declaration(self, i):
        """Treat a bogus SGML declaration as raw data. Treat a CDATA
        declaration as a CData object."""
        j = None
        if self.rawdata[i:i+9] == '<![CDATA[':
            # CDATA section: take everything up to the closing ]]> (or
            # end of input) verbatim.
            k = self.rawdata.find(']]>', i)
            if k == -1:
                k = len(self.rawdata)
            data = self.rawdata[i+9:k]
            j = k+3
            self._toStringSubclass(data, CData)
        else:
            try:
                j = HTMLParser.parse_declaration(self, i)
            except HTMLParseError:
                # Malformed declaration: pass the rest through as data.
                toHandle = self.rawdata[i:]
                self.handle_data(toHandle)
                j = i + len(toHandle)
        # Returns the index just past the consumed declaration.
        return j
+
+
class BeautifulStoneSoup(Tag):

    """This class contains the basic parser and search code. It defines
    a parser that knows nothing about tag behavior except for the
    following:

      You can't close a tag without closing all the tags it encloses.
      That is, "<foo><bar></foo>" actually means
      "<foo><bar></bar></foo>".

    [Another possible explanation is "<foo><bar /></foo>", but since
    this class defines no SELF_CLOSING_TAGS, it will never use that
    explanation.]

    This class is useful for parsing XML or made-up markup languages,
    or when BeautifulSoup makes an assumption counter to what you were
    expecting."""

    # Tag-behavior maps; subclasses (e.g. an HTML parser) override
    # these to describe their markup dialect.
    SELF_CLOSING_TAGS = {}
    NESTABLE_TAGS = {}
    RESET_NESTING_TAGS = {}
    QUOTE_TAGS = {}
    PRESERVE_WHITESPACE_TAGS = []

    # Regex fixups applied to the raw markup before parsing (see
    # __init__'s markupMassage docs for the problems these solve).
    MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
                       lambda x: x.group(1) + ' />'),
                      (re.compile('<!\s+([^<>]*)>'),
                       lambda x: '<!' + x.group(1) + '>')
                      ]

    ROOT_TAG_NAME = u'[document]'

    # Values accepted for the convertEntities constructor argument.
    HTML_ENTITIES = "html"
    XML_ENTITIES = "xml"
    XHTML_ENTITIES = "xhtml"
    # TODO: This only exists for backwards-compatibility
    ALL_ENTITIES = XHTML_ENTITIES

    # Used when determining whether a text node is all whitespace and
    # can be replaced with a single space. A text node that contains
    # fancy Unicode spaces (usually non-breaking) should be left
    # alone.
    STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }
+
    def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
                 markupMassage=True, smartQuotesTo=XML_ENTITIES,
                 convertEntities=None, selfClosingTags=None, isHTML=False,
                 builder=HTMLParserBuilder):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser.

        HTMLParser will process most bad HTML, and the BeautifulSoup
        class has some tricks for dealing with some HTML that kills
        HTMLParser, but Beautiful Soup can nonetheless choke or lose data
        if your data uses self-closing tags or declarations
        incorrectly.

        By default, Beautiful Soup uses regexes to sanitize input,
        avoiding the vast majority of these problems. If the problems
        don't apply to you, pass in False for markupMassage, and
        you'll get better performance.

        The default parser massage techniques fix the two most common
        instances of invalid HTML that choke HTMLParser:

         <br/> (No space between name of closing tag and tag close)
         <! --Comment--> (Extraneous whitespace in declaration)

        You can pass in a custom list of (RE object, replace method)
        tuples to get Beautiful Soup to scrub your input the way you
        want."""

        self.parseOnlyThese = parseOnlyThese
        self.fromEncoding = fromEncoding
        self.smartQuotesTo = smartQuotesTo
        self.convertEntities = convertEntities
        # Set the rules for how we'll deal with the entities we
        # encounter
        if self.convertEntities:
            # It doesn't make sense to convert encoded characters to
            # entities even while you're converting entities to Unicode.
            # Just convert it all to Unicode.
            self.smartQuotesTo = None
            if convertEntities == self.HTML_ENTITIES:
                self.convertXMLEntities = False
                self.convertHTMLEntities = True
                self.escapeUnrecognizedEntities = True
            elif convertEntities == self.XHTML_ENTITIES:
                self.convertXMLEntities = True
                self.convertHTMLEntities = True
                self.escapeUnrecognizedEntities = False
            elif convertEntities == self.XML_ENTITIES:
                self.convertXMLEntities = True
                self.convertHTMLEntities = False
                self.escapeUnrecognizedEntities = False
        else:
            self.convertXMLEntities = False
            self.convertHTMLEntities = False
            self.escapeUnrecognizedEntities = False

        self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
        self.builder = builder(self)
        self.reset()

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        self.markup = markup
        self.markupMassage = markupMassage
        try:
            self._feed(isHTML=isHTML)
        except StopParsing:
            pass
        # Drop references that are only needed during parsing.
        self.markup = None                 # The markup can now be GCed.
        self.builder = None                # So can the builder.
+
    def _feed(self, inDocumentEncoding=None, isHTML=False):
        # Convert the document to Unicode.
        markup = self.markup
        if isinstance(markup, unicode):
            if not hasattr(self, 'originalEncoding'):
                self.originalEncoding = None
        else:
            # Let UnicodeDammit sniff/convert the encoding, trying the
            # caller's hint and any in-document declaration first.
            dammit = UnicodeDammit\
                     (markup, [self.fromEncoding, inDocumentEncoding],
                      smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
            markup = dammit.unicode
            self.originalEncoding = dammit.originalEncoding
            self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
        if markup:
            if self.markupMassage:
                if not isList(self.markupMassage):
                    self.markupMassage = self.MARKUP_MASSAGE
                for fix, m in self.markupMassage:
                    markup = fix.sub(m, markup)
                # TODO: We get rid of markupMassage so that the
                # soup object can be deepcopied later on. Some
                # Python installations can't copy regexes. If anyone
                # was relying on the existence of markupMassage, this
                # might cause problems.
                del(self.markupMassage)
        self.builder.reset()

        self.builder.feed(markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()
+
+ def isSelfClosingTag(self, name):
+ """Returns true iff the given string is the name of a
+ self-closing tag according to this parser."""
+ return self.SELF_CLOSING_TAGS.has_key(name) \
+ or self.instanceSelfClosingTags.has_key(name)
+
    def reset(self):
        # Re-initialize this soup as the hidden root '[document]' tag
        # and clear all parse state, ready for a fresh feed.
        Tag.__init__(self, self, self.ROOT_TAG_NAME)
        self.hidden = 1
        self.builder.reset()
        self.currentData = []
        self.currentTag = None
        self.tagStack = []
        self.quoteStack = []
        self.pushTag(self)
+
    def popTag(self):
        # Closes the most recently opened tag and returns the new
        # current tag.
        tag = self.tagStack.pop()
        # Tags with just one string-owning child get the child as a
        # 'string' property, so that soup.tag.string is shorthand for
        # soup.tag.contents[0]
        if len(self.currentTag.contents) == 1 and \
           isinstance(self.currentTag.contents[0], NavigableString):
            self.currentTag.string = self.currentTag.contents[0]

        #print "Pop", tag.name
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag
+
+ def pushTag(self, tag):
+ #print "Push", tag.name
+ if self.currentTag:
+ self.currentTag.contents.append(tag)
+ self.tagStack.append(tag)
+ self.currentTag = self.tagStack[-1]
+
    def endData(self, containerClass=NavigableString):
        # Flushes accumulated character data into the tree as an
        # instance of containerClass, collapsing pure-ASCII whitespace
        # unless a whitespace-preserving tag is open.
        if self.currentData:
            currentData = u''.join(self.currentData)
            if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
                not set([tag.name for tag in self.tagStack]).intersection(
                    self.PRESERVE_WHITESPACE_TAGS)):
                # All-whitespace text collapses to a newline or a space.
                if '\n' in currentData:
                    currentData = '\n'
                else:
                    currentData = ' '
            self.currentData = []
            # With a parse-only filter active, discard top-level text
            # that the filter doesn't match.
            if self.parseOnlyThese and len(self.tagStack) <= 1 and \
                   (not self.parseOnlyThese.text or \
                    not self.parseOnlyThese.search(currentData)):
                return
            # Link the new string node into the tree.
            o = containerClass(currentData)
            o.setup(self.currentTag, self.previous)
            if self.previous:
                self.previous.next = o
            self.previous = o
            self.currentTag.contents.append(o)
+
+
    def _popToTag(self, name, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
        the given tag."""
        #print "Popping to %s" % name
        # The root tag is never popped.
        if name == self.ROOT_TAG_NAME:
            return

        numPops = 0
        mostRecentTag = None
        # Search the stack from the top down (index 0 is the root).
        for i in range(len(self.tagStack)-1, 0, -1):
            if name == self.tagStack[i].name:
                numPops = len(self.tagStack)-i
                break
        if not inclusivePop:
            numPops = numPops - 1

        for i in range(0, numPops):
            mostRecentTag = self.popTag()
        return mostRecentTag
+
    def _smartPop(self, name):

        """We need to pop up to the previous tag of this type, unless
        one of this tag's nesting reset triggers comes between this
        tag and the previous tag of this type, OR unless this tag is a
        generic nesting trigger and another generic nesting trigger
        comes between this tag and the previous tag of this type.

        Examples:
         <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
         <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
         <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.

         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
        """

        # nestingResetTriggers is None for a non-nestable tag, a list
        # (possibly empty) of resetting ancestors for a nestable one.
        nestingResetTriggers = self.NESTABLE_TAGS.get(name)
        isNestable = nestingResetTriggers != None
        isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
        popTo = None
        inclusive = True
        # Walk the open-tag stack top-down (excluding the root at
        # index 0) looking for a previous tag of this name or a tag
        # that resets nesting.
        for i in range(len(self.tagStack)-1, 0, -1):
            p = self.tagStack[i]
            if (not p or p.name == name) and not isNestable:
                #Non-nestable tags get popped to the top or to their
                #last occurance.
                popTo = name
                break
            if (nestingResetTriggers != None
                and p.name in nestingResetTriggers) \
                or (nestingResetTriggers == None and isResetNesting
                    and self.RESET_NESTING_TAGS.has_key(p.name)):

                #If we encounter one of the nesting reset triggers
                #peculiar to this tag, or we encounter another tag
                #that causes nesting to reset, pop up to but not
                #including that tag.
                popTo = p.name
                inclusive = False
                break
            # NOTE(review): this assignment has no effect -- p is
            # rebound from tagStack at the top of each iteration; looks
            # like leftover code. TODO confirm against upstream.
            p = p.parent
        if popTo:
            self._popToTag(popTo, inclusive)
+
    def unknown_starttag(self, name, attrs, selfClosing=0):
        """Handle an opening tag: create a Tag node, wire it into the
        tree and the next/previous chain, and push it on the tag stack
        (unless it is self-closing)."""
        #print "Start tag %s: %s" % (name, attrs)
        if self.quoteStack:
            #This is not a real tag.
            #print "<%s> is not real!" % name
            # Inside a quoted section (e.g. <script>) the tag is
            # re-serialized and treated as character data.
            # (Python 2 tuple-parameter lambda syntax.)
            attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
            self.handle_data('<%s%s>' % (name, attrs))
            return
        # Flush any buffered character data into the tree first.
        self.endData()

        if not self.isSelfClosingTag(name) and not selfClosing:
            # Implicitly close tags that this tag cannot nest inside.
            self._smartPop(name)

        # With a SoupStrainer active, a non-matching top-level tag is
        # skipped entirely.
        if self.parseOnlyThese and len(self.tagStack) <= 1 \
               and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
            return

        tag = Tag(self, name, attrs, self.currentTag, self.previous)
        if self.previous:
            self.previous.next = tag
        self.previous = tag
        self.pushTag(tag)
        if selfClosing or self.isSelfClosingTag(name):
            self.popTag()
        if name in self.QUOTE_TAGS:
            #print "Beginning quote (%s)" % name
            # Everything until the matching end tag is literal text.
            self.quoteStack.append(name)
            self.literal = 1
        return tag
+
+ def unknown_endtag(self, name):
+ #print "End tag %s" % name
+ if self.quoteStack and self.quoteStack[-1] != name:
+ #This is not a real end tag.
+ #print "</%s> is not real!" % name
+ self.handle_data('</%s>' % name)
+ return
+ self.endData()
+ self._popToTag(name)
+ if self.quoteStack and self.quoteStack[-1] == name:
+ self.quoteStack.pop()
+ self.literal = (len(self.quoteStack) > 0)
+
    def handle_data(self, data):
        # Buffer character data; endData() flushes it into the tree.
        self.currentData.append(data)
+
    def extractCharsetFromMeta(self, attrs):
        # The base (XML) parser does no charset sniffing: treat <meta>
        # like any other tag. BeautifulSoup overrides this.
        self.unknown_starttag('meta', attrs)
+
+
class BeautifulSoup(BeautifulStoneSoup):

    """This parser knows the following facts about HTML:

    * Some tags have no closing tag and should be interpreted as being
      closed as soon as they are encountered.

    * The text inside some tags (ie. 'script') may contain tags which
      are not really part of the document and which should be parsed
      as text, not tags. If you want to parse the text as tags, you can
      always fetch it and parse it explicitly.

    * Tag nesting rules:

      Most tags can't be nested at all. For instance, the occurrence of
      a <p> tag should implicitly close the previous <p> tag.

       <p>Para1<p>Para2
        should be transformed into:
       <p>Para1</p><p>Para2

      Some tags can be nested arbitrarily. For instance, the occurrence
      of a <blockquote> tag should _not_ implicitly close the previous
      <blockquote> tag.

       Alice said: <blockquote>Bob said: <blockquote>Blah
        should NOT be transformed into:
       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah

      Some tags can be nested, but the nesting is reset by the
      interposition of other tags. For instance, a <tr> tag should
      implicitly close the previous <tr> tag within the same <table>,
      but not close a <tr> tag in another table.

       <table><tr>Blah<tr>Blah
        should be transformed into:
       <table><tr>Blah</tr><tr>Blah
       but,
       <tr>Blah<table><tr>Blah
        should NOT be transformed into
       <tr>Blah<table></tr><tr>Blah

    Differing assumptions about tag nesting rules are a major source
    of problems with the BeautifulSoup class. If BeautifulSoup is not
    treating as nestable a tag your page author treats as nestable,
    try ICantBelieveItsBeautifulSoup, MinimalSoup, or
    BeautifulStoneSoup before writing your own subclass."""

    def __init__(self, *args, **kwargs):
        """Set HTML-appropriate defaults (smart-quote conversion to
        HTML entities, <meta>-based charset sniffing) before delegating
        to BeautifulStoneSoup."""
        if 'smartQuotesTo' not in kwargs:
            kwargs['smartQuotesTo'] = self.HTML_ENTITIES
        kwargs['isHTML'] = True
        BeautifulStoneSoup.__init__(self, *args, **kwargs)

    # Tags that never have a closing tag.
    SELF_CLOSING_TAGS = buildTagMap(None,
                                    ['br' , 'hr', 'input', 'img', 'meta',
                                     'spacer', 'link', 'frame', 'base'])

    # Whitespace inside these tags is significant and never collapsed.
    PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])

    # The content of these tags is treated as literal text, not markup.
    QUOTE_TAGS = {'script' : None, 'textarea' : None}

    #According to the HTML standard, each of these inline tags can
    #contain another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
                            'center']

    #According to the HTML standard, these block tags can contain
    #another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']

    #Lists can contain other lists, but there are restrictions.
    NESTABLE_LIST_TAGS = { 'ol' : [],
                           'ul' : [],
                           'li' : ['ul', 'ol'],
                           'dl' : [],
                           'dd' : ['dl'],
                           'dt' : ['dl'] }

    #Tables can contain other tables, but there are restrictions.
    NESTABLE_TABLE_TAGS = {'table' : [],
                           'tr' : ['table', 'tbody', 'tfoot', 'thead'],
                           'td' : ['tr'],
                           'th' : ['tr'],
                           'thead' : ['table'],
                           'tbody' : ['table'],
                           'tfoot' : ['table'],
                           }

    NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']

    #If one of these tags is encountered, all tags up to the next tag of
    #this type are popped.
    RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
                                     NON_NESTABLE_BLOCK_TAGS,
                                     NESTABLE_LIST_TAGS,
                                     NESTABLE_TABLE_TAGS)

    NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
                                NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)

    # Used to detect the charset in a META tag; see extractCharsetFromMeta.
    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def extractCharsetFromMeta(self, attrs):
        """Beautiful Soup can detect a charset included in a META tag,
        try to convert the document to that charset, and re-parse the
        document from the beginning."""
        httpEquiv = None
        contentType = None
        contentTypeIndex = None
        tagNeedsEncodingSubstitution = False

        for i in range(0, len(attrs)):
            key, value = attrs[i]
            key = key.lower()
            if key == 'http-equiv':
                httpEquiv = value
            elif key == 'content':
                contentType = value
                contentTypeIndex = i

        if httpEquiv and contentType: # It's an interesting meta tag.
            match = self.CHARSET_RE.search(contentType)
            if match:
                if (self.declaredHTMLEncoding is not None or
                    self.originalEncoding == self.fromEncoding):
                    # An HTML encoding was sniffed while converting
                    # the document to Unicode, or an HTML encoding was
                    # sniffed during a previous pass through the
                    # document, or an encoding was specified
                    # explicitly and it worked. Rewrite the meta tag.
                    def rewrite(match):
                        return match.group(1) + "%SOUP-ENCODING%"
                    newAttr = self.CHARSET_RE.sub(rewrite, contentType)
                    attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
                                               newAttr)
                    tagNeedsEncodingSubstitution = True
                else:
                    # This is our first pass through the document.
                    # Go through it again with the encoding information.
                    newCharset = match.group(3)
                    if newCharset and newCharset != self.originalEncoding:
                        self.declaredHTMLEncoding = newCharset
                        self._feed(self.declaredHTMLEncoding)
                        raise StopParsing
        tag = self.unknown_starttag("meta", attrs)
        if tag and tagNeedsEncodingSubstitution:
            tag.containsSubstitutions = True
+
+
class StopParsing(Exception):
    """Raised to abort a parse in progress, e.g. when a <meta> tag
    reveals the document's real encoding and the parse must restart."""
    pass
+
class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """The BeautifulSoup class is oriented towards skipping over
    common HTML errors like unclosed tags. However, sometimes it makes
    errors of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close a 'b' tag
    than to actually use nested 'b' tags, and the BeautifulSoup class
    handles the common case. This class handles the not-so-common
    case: where you can't believe someone wrote what they did, but
    it's valid HTML and BeautifulSoup screwed up by assuming it
    wouldn't be."""

    # Inline tags that valid HTML allows to nest within themselves.
    # ('strong' and 'big' each appeared twice in the original list; the
    # duplicates were redundant since buildTagMap produces a map.)
    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
     ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
      'cite', 'code', 'dfn', 'kbd', 'samp', 'var', 'b']

    # Block-level tags allowed to nest within themselves.
    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']

    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
+
class MinimalSoup(BeautifulSoup):
    """The MinimalSoup class is for parsing HTML that contains
    pathologically bad markup. It makes no assumptions about tag
    nesting, but it does know which tags are self-closing, that
    <script> tags contain Javascript and should not be parsed, that
    META tags may contain encoding information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup
    or BeautifulSoup."""

    # NOTE(review): buildTagMap's first argument is the default value,
    # so with no further arguments this evaluates to an empty map and
    # 'noscript' is never used as a tag name -- looks unintended.
    # TODO confirm against upstream Beautiful Soup.
    RESET_NESTING_TAGS = buildTagMap('noscript')
    # No nesting assumptions at all.
    NESTABLE_TAGS = {}
+
class BeautifulSOAP(BeautifulStoneSoup):
    """This class will push a tag with only a single string child into
    the tag's parent as an attribute. The attribute's name is the tag
    name, and the value is the string child. An example should give
    the flavor of the change:

    <foo><bar>baz</bar></foo>
     =>
    <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    def popTag(self):
        # Before the standard pop: if the tag being closed holds a
        # single string child, promote it to an attribute on its
        # parent (unless the parent already has one of that name).
        if len(self.tagStack) > 1:
            tag = self.tagStack[-1]
            parent = self.tagStack[-2]
            parent._getAttrMap()  # ensure parent.attrMap is populated
            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
                isinstance(tag.contents[0], NavigableString) and
                not parent.attrMap.has_key(tag.name)):
                parent[tag.name] = tag.contents[0]
        BeautifulStoneSoup.popTag(self)
+
+#Enterprise class names! It has come to our attention that some people
+#think the names of the Beautiful Soup parser classes are too silly
+#and "unprofessional" for use in enterprise screen-scraping. We feel
+#your pain! For such-minded folk, the Beautiful Soup Consortium And
+#All-Night Kosher Bakery recommends renaming this file to
+#"RobustParser.py" (or, in cases of extreme enterprisiness,
+#"RobustParserBeanInterface.class") and using the following
+#enterprise-friendly class aliases:
class RobustXMLParser(BeautifulStoneSoup):
    """Enterprise-friendly alias for BeautifulStoneSoup."""
    pass
class RobustHTMLParser(BeautifulSoup):
    """Enterprise-friendly alias for BeautifulSoup."""
    pass
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
    """Enterprise-friendly alias for ICantBelieveItsBeautifulSoup."""
    pass
class RobustInsanelyWackAssHTMLParser(MinimalSoup):
    """Enterprise-friendly alias for MinimalSoup."""
    pass
class SimplifyingSOAPParser(BeautifulSOAP):
    """Enterprise-friendly alias for BeautifulSOAP."""
    pass
+
+######################################################
+#
+# Bonus library: Unicode, Dammit
+#
+# This class forces XML data into a standard format (usually to UTF-8
+# or Unicode). It is heavily based on code from Mark Pilgrim's
+# Universal Feed Parser. It does not rewrite the XML or HTML to
+# reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
+# (XML) and BeautifulSoup.start_meta (HTML).
+
+# Autodetects character encodings.
+# Download from http://chardet.feedparser.org/
+try:
+ import chardet
+# import chardet.constants
+# chardet.constants._debug = 1
+except ImportError:
+ chardet = None
+
+# cjkcodecs and iconv_codec make Python know about more character encodings.
+# Both are available from http://cjkpython.i18n.org/
+# They're built in if you use Python 2.4.
+try:
+ import cjkcodecs.aliases
+except ImportError:
+ pass
+try:
+ import iconv_codec
+except ImportError:
+ pass
+
class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents."""

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = { "macintosh" : "mac-roman",
                        "x-sjis" : "shift-jis" }

    def __init__(self, markup, overrideEncodings=[],
                 smartQuotesTo='xml', isHTML=False):
        # NOTE(review): the mutable default for overrideEncodings is
        # only iterated, never mutated, so the shared-default pitfall
        # does not bite here.
        self.declaredHTMLEncoding = None
        # _detectEncoding may transcode the markup as a side effect
        # (e.g. strip a BOM and convert UTF-16/32 input to UTF-8).
        self.markup, documentEncoding, sniffedEncoding = \
                     self._detectEncoding(markup, isHTML)
        self.smartQuotesTo = smartQuotesTo
        self.triedEncodings = []
        if markup == '' or isinstance(markup, unicode):
            # Already Unicode (or empty): nothing to detect.
            self.originalEncoding = None
            self.unicode = unicode(markup)
            return

        # Try encodings in order of preference: caller overrides,
        # then the document's declared encoding, then the sniffed one.
        u = None
        for proposedEncoding in overrideEncodings:
            u = self._convertFrom(proposedEncoding)
            if u: break
        if not u:
            for proposedEncoding in (documentEncoding, sniffedEncoding):
                u = self._convertFrom(proposedEncoding)
                if u: break

        # If no luck and we have auto-detection library, try that:
        if not u and chardet and not isinstance(self.markup, unicode):
            u = self._convertFrom(chardet.detect(self.markup)['encoding'])

        # As a last resort, try utf-8 and windows-1252:
        if not u:
            for proposed_encoding in ("utf-8", "windows-1252"):
                u = self._convertFrom(proposed_encoding)
                if u: break

        # self.unicode is None when every conversion attempt failed.
        self.unicode = u
        if not u: self.originalEncoding = None

    def _subMSChar(self, match):
        """Changes a MS smart quote character to an XML or HTML
        entity."""
        orig = match.group(1)
        sub = self.MS_CHARS.get(orig)
        # A tuple entry is (entity name, hex codepoint); a plain
        # string entry is substituted literally.
        if type(sub) == types.TupleType:
            if self.smartQuotesTo == 'xml':
                sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
            else:
                sub = '&'.encode() + sub[0].encode() + ';'.encode()
        else:
            sub = sub.encode()
        return sub

    def _convertFrom(self, proposed):
        """Try to convert self.markup to Unicode using the proposed
        encoding; returns the converted markup, or None on failure."""
        proposed = self.find_codec(proposed)
        if not proposed or proposed in self.triedEncodings:
            return None
        self.triedEncodings.append(proposed)
        markup = self.markup

        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if self.smartQuotesTo and proposed.lower() in("windows-1252",
                                                      "iso-8859-1",
                                                      "iso-8859-2"):
            smart_quotes_re = "([\x80-\x9f])"
            smart_quotes_compiled = re.compile(smart_quotes_re)
            markup = smart_quotes_compiled.sub(self._subMSChar, markup)

        try:
            # print "Trying to convert document to %s" % proposed
            u = self._toUnicode(markup, proposed)
            self.markup = u
            self.originalEncoding = proposed
        except Exception, e:
            # print "That didn't work!"
            # print e
            return None
        #print "Correct encoding: %s" % proposed
        return self.markup

    def _toUnicode(self, data, encoding):
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases'''

        # strip Byte Order Mark (if present)
        if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
               and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
                 and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == '\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == '\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == '\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        newdata = unicode(data, encoding)
        return newdata

    def _detectEncoding(self, xml_data, isHTML=False):
        """Given a document, tries to detect its XML encoding."""
        xml_encoding = sniffed_xml_encoding = None
        try:
            # Sniff the byte-order mark / first-bytes signature to
            # recognize EBCDIC and the UTF-16/32 variants; recognized
            # input is transcoded to UTF-8.
            if xml_data[:4] == '\x4c\x6f\xa7\x94':
                # EBCDIC
                xml_data = self._ebcdic_to_ascii(xml_data)
            elif xml_data[:4] == '\x00\x3c\x00\x3f':
                # UTF-16BE
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
                     and (xml_data[2:4] != '\x00\x00'):
                # UTF-16BE with BOM
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x3f\x00':
                # UTF-16LE
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
                     (xml_data[2:4] != '\x00\x00'):
                # UTF-16LE with BOM
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\x00\x3c':
                # UTF-32BE
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x00\x00':
                # UTF-32LE
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\xfe\xff':
                # UTF-32BE with BOM
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\xff\xfe\x00\x00':
                # UTF-32LE with BOM
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
            elif xml_data[:3] == '\xef\xbb\xbf':
                # UTF-8 with BOM
                sniffed_xml_encoding = 'utf-8'
                xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
            else:
                sniffed_xml_encoding = 'ascii'
                pass
        except:
            # Transcoding failed: fall through and rely on the
            # declared encoding, if any.
            xml_encoding_match = None
        # Look for an explicit declaration: an XML prolog, or (for
        # HTML) a <meta ... charset=...> tag.
        xml_encoding_re = '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode()
        xml_encoding_match = re.compile(xml_encoding_re).match(xml_data)
        if not xml_encoding_match and isHTML:
            meta_re = '<\s*meta[^>]+charset=([^>]*?)[;\'">]'.encode()
            regexp = re.compile(meta_re, re.I)
            xml_encoding_match = regexp.search(xml_data)
        if xml_encoding_match is not None:
            xml_encoding = xml_encoding_match.groups()[0].decode(
                'ascii').lower()
            if isHTML:
                self.declaredHTMLEncoding = xml_encoding
        # A BOM/signature sniff overrides a declared multi-byte
        # encoding, since the sniff is more specific.
        if sniffed_xml_encoding and \
           (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
                             'iso-10646-ucs-4', 'ucs-4', 'csucs4',
                             'utf-16', 'utf-32', 'utf_16', 'utf_32',
                             'utf16', 'u16')):
            xml_encoding = sniffed_xml_encoding
        return xml_data, xml_encoding, sniffed_xml_encoding


    def find_codec(self, charset):
        """Map a charset name to a codec Python recognizes, trying the
        alias table and common hyphen/underscore variants."""
        return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
               or (charset and self._codec(charset.replace("-", ""))) \
               or (charset and self._codec(charset.replace("-", "_"))) \
               or charset

    def _codec(self, charset):
        # Return `charset` if Python has a codec for it, else None.
        if not charset: return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            pass
        return codec

    # Translation table from EBCDIC to ASCII, built lazily on first use.
    EBCDIC_TO_ASCII_MAP = None
    def _ebcdic_to_ascii(self, s):
        c = self.__class__
        if not c.EBCDIC_TO_ASCII_MAP:
            emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
                    16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
                    128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
                    144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
                    32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
                    38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
                    45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
                    186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
                    195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
                    201,202,106,107,108,109,110,111,112,113,114,203,204,205,
                    206,207,208,209,126,115,116,117,118,119,120,121,122,210,
                    211,212,213,214,215,216,217,218,219,220,221,222,223,224,
                    225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
                    73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
                    82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
                    90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
                    250,251,252,253,254,255)
            import string
            c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
                ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
        return s.translate(c.EBCDIC_TO_ASCII_MAP)

    # windows-1252 "smart" characters mapped to (entity name, hex
    # codepoint) pairs, or to a literal replacement string.
    MS_CHARS = { '\x80' : ('euro', '20AC'),
                 '\x81' : ' ',
                 '\x82' : ('sbquo', '201A'),
                 '\x83' : ('fnof', '192'),
                 '\x84' : ('bdquo', '201E'),
                 '\x85' : ('hellip', '2026'),
                 '\x86' : ('dagger', '2020'),
                 '\x87' : ('Dagger', '2021'),
                 '\x88' : ('circ', '2C6'),
                 '\x89' : ('permil', '2030'),
                 '\x8A' : ('Scaron', '160'),
                 '\x8B' : ('lsaquo', '2039'),
                 '\x8C' : ('OElig', '152'),
                 '\x8D' : '?',
                 '\x8E' : ('#x17D', '17D'),
                 '\x8F' : '?',
                 '\x90' : '?',
                 '\x91' : ('lsquo', '2018'),
                 '\x92' : ('rsquo', '2019'),
                 '\x93' : ('ldquo', '201C'),
                 '\x94' : ('rdquo', '201D'),
                 '\x95' : ('bull', '2022'),
                 '\x96' : ('ndash', '2013'),
                 '\x97' : ('mdash', '2014'),
                 '\x98' : ('tilde', '2DC'),
                 '\x99' : ('trade', '2122'),
                 '\x9a' : ('scaron', '161'),
                 '\x9b' : ('rsaquo', '203A'),
                 '\x9c' : ('oelig', '153'),
                 '\x9d' : '?',
                 '\x9e' : ('#x17E', '17E'),
                 '\x9f' : ('Yuml', ''),}
+
+#######################################################################
+
+
+#By default, act as an HTML pretty-printer.
if __name__ == '__main__':
    # Read HTML from stdin and pretty-print it (Python 2 print statement).
    import sys
    soup = BeautifulSoup(sys.stdin)
    print soup.prettify()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/app/htmlsanitizer/BeautifulSoupTests.py Tue May 26 02:37:39 2009 +0200
@@ -0,0 +1,826 @@
+# -*- coding: utf-8 -*-
+"""Unit tests for Beautiful Soup.
+
+These tests make sure the Beautiful Soup works as it should. If you
+find a bug in Beautiful Soup, the best way to express it is as a test
+case like this that fails."""
+
+import unittest
+from BeautifulSoup import *
+
class SoupTest(unittest.TestCase):
    """Base class providing a helper that compares a document's parsed
    representation against an expected string."""

    def assertSoupEquals(self, toParse, rep=None, c=BeautifulSoup,
                         encoding=None):
        """Parse `toParse` with soup class `c` and assert that its
        string representation equals `rep` (or round-trips back to
        `toParse` when `rep` is omitted). If `encoding` is given,
        compare the encoded byte representation instead of the
        Unicode one."""
        # Identity test for None (was `rep == None`), avoiding a
        # spurious __eq__ dispatch.
        if rep is None:
            rep = toParse
        obj = c(toParse)
        if encoding is None:
            rep2 = obj.decode()
        else:
            rep2 = obj.encode(encoding)
        self.assertEqual(rep2, rep)
+
class FollowThatTag(SoupTest):

    "Tests the various ways of fetching tags from a soup."

    def setUp(self):
        # Deliberately sloppy markup: mixed-case tags, mismatched
        # closing tags and an unquoted attribute exercise the
        # parser's error tolerance.
        ml = """
        <a id="x">1</a>
        <A id="a">2</a>
        <b id="b">3</a>
        <b href="foo" id="x">4</a>
        <ac width=100>4</ac>"""
        self.soup = BeautifulStoneSoup(ml)

    def testFindAllByName(self):
        # Calling the soup is shorthand for findAll.
        matching = self.soup('a')
        self.assertEqual(len(matching), 2)
        self.assertEqual(matching[0].name, 'a')
        self.assertEqual(matching, self.soup.findAll('a'))
        self.assertEqual(matching, self.soup.findAll(SoupStrainer('a')))

    def testFindAllByAttribute(self):
        matching = self.soup.findAll(id='x')
        self.assertEqual(len(matching), 2)
        self.assertEqual(matching[0].name, 'a')
        self.assertEqual(matching[1].name, 'b')

        matching2 = self.soup.findAll(attrs={'id' : 'x'})
        self.assertEqual(matching, matching2)

        strainer = SoupStrainer(attrs={'id' : 'x'})
        self.assertEqual(matching, self.soup.findAll(strainer))

        # id=None matches tags that *lack* the attribute.
        self.assertEqual(len(self.soup.findAll(id=None)), 1)

        self.assertEqual(len(self.soup.findAll(width=100)), 1)
        self.assertEqual(len(self.soup.findAll(junk=None)), 5)
        self.assertEqual(len(self.soup.findAll(junk=[1, None])), 5)

        self.assertEqual(len(self.soup.findAll(junk=re.compile('.*'))), 0)
        self.assertEqual(len(self.soup.findAll(junk=True)), 0)

        self.assertEqual(len(self.soup.findAll(junk=True)), 0)
        self.assertEqual(len(self.soup.findAll(href=True)), 1)

    def testFindallByClass(self):
        soup = BeautifulSoup('<a>Foo</a><a class="1">Bar</a>')
        self.assertEqual(soup.find('a', '1').string, "Bar")

    def testFindAllByList(self):
        matching = self.soup(['a', 'ac'])
        self.assertEqual(len(matching), 3)

    def testFindAllByHash(self):
        matching = self.soup({'a' : True, 'b' : True})
        self.assertEqual(len(matching), 4)

    def testFindAllText(self):
        soup = BeautifulSoup("<html>\xbb</html>")
        self.assertEqual(soup.findAll(text=re.compile('.*')),
                         [u'\xbb'])

    def testFindAllByRE(self):
        import re
        r = re.compile('a.*')
        self.assertEqual(len(self.soup(r)), 3)

    def testFindAllByMethod(self):
        # A callable matcher receives each Tag and returns a boolean.
        def matchTagWhereIDMatchesName(tag):
            return tag.name == tag.get('id')

        matching = self.soup.findAll(matchTagWhereIDMatchesName)
        self.assertEqual(len(matching), 2)
        self.assertEqual(matching[0].name, 'a')

    def testParents(self):
        soup = BeautifulSoup('<ul id="foo"></ul><ul id="foo"><ul><ul id="foo" a="b"><b>Blah')
        b = soup.b
        self.assertEquals(len(b.findParents('ul', {'id' : 'foo'})), 2)
        self.assertEquals(b.findParent('ul')['a'], 'b')

    # Built once, at class-definition time; shared by the test methods.
    PROXIMITY_TEST = BeautifulSoup('<b id="1"><b id="2"><b id="3"><b id="4">')

    def testNext(self):
        soup = self.PROXIMITY_TEST
        b = soup.find('b', {'id' : 2})
        self.assertEquals(b.findNext('b')['id'], '3')
        self.assertEquals(b.findNext('b')['id'], '3')
        self.assertEquals(len(b.findAllNext('b')), 2)
        self.assertEquals(len(b.findAllNext('b', {'id' : 4})), 1)

    def testPrevious(self):
        soup = self.PROXIMITY_TEST
        b = soup.find('b', {'id' : 3})
        self.assertEquals(b.findPrevious('b')['id'], '2')
        self.assertEquals(b.findPrevious('b')['id'], '2')
        self.assertEquals(len(b.findAllPrevious('b')), 2)
        self.assertEquals(len(b.findAllPrevious('b', {'id' : 2})), 1)


    # Built once, at class-definition time; shared by the test methods.
    SIBLING_TEST = BeautifulSoup('<blockquote id="1"><blockquote id="1.1"></blockquote></blockquote><blockquote id="2"><blockquote id="2.1"></blockquote></blockquote><blockquote id="3"><blockquote id="3.1"></blockquote></blockquote><blockquote id="4">')

    def testNextSibling(self):
        soup = self.SIBLING_TEST
        tag = 'blockquote'
        b = soup.find(tag, {'id' : 2})
        self.assertEquals(b.findNext(tag)['id'], '2.1')
        self.assertEquals(b.findNextSibling(tag)['id'], '3')
        self.assertEquals(b.findNextSibling(tag)['id'], '3')
        self.assertEquals(len(b.findNextSiblings(tag)), 2)
        self.assertEquals(len(b.findNextSiblings(tag, {'id' : 4})), 1)

    def testPreviousSibling(self):
        soup = self.SIBLING_TEST
        tag = 'blockquote'
        b = soup.find(tag, {'id' : 3})
        self.assertEquals(b.findPrevious(tag)['id'], '2.1')
        self.assertEquals(b.findPreviousSibling(tag)['id'], '2')
        self.assertEquals(b.findPreviousSibling(tag)['id'], '2')
        self.assertEquals(len(b.findPreviousSiblings(tag)), 2)
        self.assertEquals(len(b.findPreviousSiblings(tag, id=1)), 1)

    def testTextNavigation(self):
        soup = BeautifulSoup('Foo<b>Bar</b><i id="1"><b>Baz<br />Blee<hr id="1"/></b></i>Blargh')
        baz = soup.find(text='Baz')
        self.assertEquals(baz.findParent("i")['id'], '1')
        self.assertEquals(baz.findNext(text='Blee'), 'Blee')
        self.assertEquals(baz.findNextSibling(text='Blee'), 'Blee')
        self.assertEquals(baz.findNextSibling(text='Blargh'), None)
        self.assertEquals(baz.findNextSibling('hr')['id'], '1')
+
class SiblingRivalry(SoupTest):
    "Tests the nextSibling and previousSibling navigation."

    def testSiblings(self):
        # The unclosed <li> tags are implicitly closed by the parser.
        soup = BeautifulSoup("<ul><li>1<p>A</p>B<li>2<li>3</ul>")
        secondLI = soup.find('li').nextSibling
        self.assert_(secondLI.name == 'li' and secondLI.string == '2')
        self.assertEquals(soup.find(text='1').nextSibling.name, 'p')
        self.assertEquals(soup.find('p').nextSibling, 'B')
        self.assertEquals(soup.find('p').nextSibling.previousSibling.nextSibling, 'B')
+
class TagsAreObjectsToo(SoupTest):
    "Tests the various built-in functions of Tag objects."

    def testLen(self):
        # len(tag) counts the tag's direct children.
        soup = BeautifulSoup("<top>1<b>2</b>3</top>")
        self.assertEquals(len(soup.top), 3)
+
class StringEmUp(SoupTest):
    "Tests the use of 'string' as an alias for a tag's only content."

    def testString(self):
        s = BeautifulSoup("<b>foo</b>")
        self.assertEquals(s.b.string, 'foo')

    def testLackOfString(self):
        # 'string' is only set when a tag has exactly one string child.
        s = BeautifulSoup("<b>f<i>e</i>o</b>")
        self.assert_(not s.b.string)
+
class ThatsMyLimit(SoupTest):
    "Tests the limit argument."

    def testBasicLimits(self):
        s = BeautifulSoup('<br id="1" /><br id="1" /><br id="1" /><br id="1" />')
        self.assertEquals(len(s.findAll('br')), 4)
        self.assertEquals(len(s.findAll('br', limit=2)), 2)
        # Calling the soup is shorthand for findAll.
        self.assertEquals(len(s('br', limit=2)), 2)
+
class OnlyTheLonely(SoupTest):
    "Tests the parseOnly argument to the constructor."
    def setUp(self):
        # Markup of five <a> tags, each containing three <b> tags.
        x = []
        for i in range(1,6):
            x.append('<a id="%s">' % i)
            for j in range(100,103):
                x.append('<b id="%s.%s">Content %s.%s</b>' % (i,j, i,j))
            x.append('</a>')
        self.x = ''.join(x)

    def testOnly(self):
        strainer = SoupStrainer("b")
        soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
        self.assertEquals(len(soup), 15)

        strainer = SoupStrainer(id=re.compile("100.*"))
        soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
        self.assertEquals(len(soup), 5)

        strainer = SoupStrainer(text=re.compile("10[01].*"))
        soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
        self.assertEquals(len(soup), 10)

        # Python 2 tuple-parameter lambda syntax; kept as-is.
        strainer = SoupStrainer(text=lambda(x):x[8]=='3')
        soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
        self.assertEquals(len(soup), 3)
+
class PickleMeThis(SoupTest):
    "Testing features like pickle and deepcopy."

    def setUp(self):
        self.page = """<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
"http://www.w3.org/TR/REC-html40/transitional.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>Beautiful Soup: We called him Tortoise because he taught us.</title>
<link rev="made" href="mailto:leonardr@segfault.org">
<meta name="Description" content="Beautiful Soup: an HTML parser optimized for screen-scraping.">
<meta name="generator" content="Markov Approximation 1.4 (module: leonardr)">
<meta name="author" content="Leonard Richardson">
</head>
<body>
<a href="foo">foo</a>
<a href="foo"><b>bar</b></a>
</body>
</html>"""

        self.soup = BeautifulSoup(self.page)

    def testPickle(self):
        # A pickle/unpickle round trip must preserve class and output.
        import pickle
        dumped = pickle.dumps(self.soup, 2)
        loaded = pickle.loads(dumped)
        self.assertEqual(loaded.__class__, BeautifulSoup)
        self.assertEqual(loaded.decode(), self.soup.decode())

    def testDeepcopy(self):
        from copy import deepcopy
        deepcopy(BeautifulSoup("<a></a>"))
        copied = deepcopy(self.soup)
        self.assertEqual(copied.decode(), self.soup.decode())

    def testUnicodePickle(self):
        # A non-ASCII byte must survive a highest-protocol round trip.
        import cPickle as pickle
        html = "<b>" + chr(0xc3) + "</b>"
        soup = BeautifulSoup(html)
        dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL)
        loaded = pickle.loads(dumped)
        self.assertEqual(loaded.decode(), soup.decode())
+
+
class WriteOnlyCode(SoupTest):
    "Testing the modification of the tree."

    def testModifyAttributes(self):
        # Set, delete and add attributes through the mapping interface.
        soup = BeautifulSoup('<a id="1"></a>')
        soup.a['id'] = 2
        self.assertEqual(soup.decode(), '<a id="2"></a>')
        del(soup.a['id'])
        self.assertEqual(soup.decode(), '<a></a>')
        soup.a['id2'] = 'foo'
        self.assertEqual(soup.decode(), '<a id2="foo"></a>')

    def testNewTagCreation(self):
        "Makes sure tags don't step on each others' toes."
        soup = BeautifulSoup()
        a = Tag(soup, 'a')
        ol = Tag(soup, 'ol')
        a['href'] = 'http://foo.com/'
        # Setting an attribute on one new Tag must not leak into the other.
        self.assertRaises(KeyError, lambda : ol['href'])

    def testTagReplacement(self):
        # Make sure you can replace an element with itself.
        text = "<a><b></b><c>Foo<d></d></c></a><a><e></e></a>"
        soup = BeautifulSoup(text)
        c = soup.c
        soup.c.replaceWith(c)
        self.assertEquals(soup.decode(), text)

        # A very simple case
        soup = BeautifulSoup("<b>Argh!</b>")
        soup.find(text="Argh!").replaceWith("Hooray!")
        newText = soup.find(text="Hooray!")
        b = soup.b
        self.assertEqual(newText.previous, b)
        self.assertEqual(newText.parent, b)
        self.assertEqual(newText.previous.next, newText)
        self.assertEqual(newText.next, None)

        # A more complex case
        soup = BeautifulSoup("<a><b>Argh!</b><c></c><d></d></a>")
        soup.b.insert(1, "Hooray!")
        newText = soup.find(text="Hooray!")
        self.assertEqual(newText.previous, "Argh!")
        self.assertEqual(newText.previous.next, newText)

        self.assertEqual(newText.previousSibling, "Argh!")
        self.assertEqual(newText.previousSibling.nextSibling, newText)

        self.assertEqual(newText.nextSibling, None)
        self.assertEqual(newText.next, soup.c)

        text = "<html>There's <b>no</b> business like <b>show</b> business</html>"
        soup = BeautifulSoup(text)
        no, show = soup.findAll('b')
        # Replacing one existing tag with another MOVES the replacement.
        show.replaceWith(no)
        self.assertEquals(soup.decode(), "<html>There's business like <b>no</b> business</html>")

        # Even more complex
        soup = BeautifulSoup("<a><b>Find</b><c>lady!</c><d></d></a>")
        tag = Tag(soup, 'magictag')
        tag.insert(0, "the")
        soup.a.insert(1, tag)

        b = soup.b
        c = soup.c
        theText = tag.find(text=True)
        findText = b.find(text="Find")

        # The inserted tag must be threaded into both the sibling chain and
        # the document-order (next/previous) chain.
        self.assertEqual(findText.next, tag)
        self.assertEqual(tag.previous, findText)
        self.assertEqual(b.nextSibling, tag)
        self.assertEqual(tag.previousSibling, b)
        self.assertEqual(tag.nextSibling, c)
        self.assertEqual(c.previousSibling, tag)

        self.assertEqual(theText.next, c)
        self.assertEqual(c.previous, theText)

        # Aand... incredibly complex.
        soup = BeautifulSoup("""<a>We<b>reserve<c>the</c><d>right</d></b></a><e>to<f>refuse</f><g>service</g></e>""")
        f = soup.f
        a = soup.a
        c = soup.c
        e = soup.e
        weText = a.find(text="We")
        soup.b.replaceWith(soup.f)
        self.assertEqual(soup.decode(), "<a>We<f>refuse</f></a><e>to<g>service</g></e>")

        self.assertEqual(f.previous, weText)
        self.assertEqual(weText.next, f)
        self.assertEqual(f.previousSibling, weText)
        self.assertEqual(f.nextSibling, None)
        self.assertEqual(weText.nextSibling, f)

    def testAppend(self):
        doc = "<p>Don't leave me <b>here</b>.</p> <p>Don't leave me.</p>"
        soup = BeautifulSoup(doc)
        second_para = soup('p')[1]
        bold = soup.find('b')
        # append() of an attached tag MOVES it out of its old parent.
        soup('p')[1].append(soup.find('b'))
        self.assertEqual(bold.parent, second_para)
        self.assertEqual(soup.decode(),
                         "<p>Don't leave me .</p> "
                         "<p>Don't leave me.<b>here</b></p>")

    def testTagExtraction(self):
        # A very simple case
        text = '<html><div id="nav">Nav crap</div>Real content here.</html>'
        soup = BeautifulSoup(text)
        extracted = soup.find("div", id="nav").extract()
        self.assertEqual(soup.decode(), "<html>Real content here.</html>")
        self.assertEqual(extracted.decode(), '<div id="nav">Nav crap</div>')

        # A simple case, a more complex test.
        text = "<doc><a>1<b>2</b></a><a>i<b>ii</b></a><a>A<b>B</b></a></doc>"
        soup = BeautifulStoneSoup(text)
        doc = soup.doc
        numbers, roman, letters = soup("a")

        self.assertEqual(roman.parent, doc)
        oldPrevious = roman.previous
        endOfThisTag = roman.nextSibling.previous
        self.assertEqual(oldPrevious, "2")
        self.assertEqual(roman.next, "i")
        self.assertEqual(endOfThisTag, "ii")
        self.assertEqual(roman.previousSibling, numbers)
        self.assertEqual(roman.nextSibling, letters)

        # After extraction the removed subtree keeps its internal links but
        # is fully detached from the document's chains.
        roman.extract()
        self.assertEqual(roman.parent, None)
        self.assertEqual(roman.previous, None)
        self.assertEqual(roman.next, "i")
        self.assertEqual(letters.previous, '2')
        self.assertEqual(roman.previousSibling, None)
        self.assertEqual(roman.nextSibling, None)
        self.assertEqual(endOfThisTag.next, None)
        self.assertEqual(roman.b.contents[0].next, None)
        self.assertEqual(numbers.nextSibling, letters)
        self.assertEqual(letters.previousSibling, numbers)
        self.assertEqual(len(doc.contents), 2)
        self.assertEqual(doc.contents[0], numbers)
        self.assertEqual(doc.contents[1], letters)

        # A more complex case.
        text = "<a>1<b>2<c>Hollywood, baby!</c></b></a>3"
        soup = BeautifulStoneSoup(text)
        one = soup.find(text="1")
        three = soup.find(text="3")
        toExtract = soup.b
        soup.b.extract()
        self.assertEqual(one.next, three)
        self.assertEqual(three.previous, one)
        self.assertEqual(one.parent.nextSibling, three)
        self.assertEqual(three.previousSibling, soup.a)
+
class TheManWithoutAttributes(SoupTest):
    "Test attribute access"

    def testHasKey(self):
        # Tag.has_key() reports whether the tag carries the given attribute.
        text = "<foo attr='bar'>"
        self.assertTrue(BeautifulSoup(text).foo.has_key('attr'))
+
class QuoteMeOnThat(SoupTest):
    "Test quoting"
    def testQuotedAttributeValues(self):
        # Single-quoted attributes are normalised to double quotes.
        self.assertSoupEquals("<foo attr='bar'></foo>",
                              '<foo attr="bar"></foo>')

        # A value containing double quotes keeps its single-quote delimiters.
        text = """<foo attr='bar "brawls" happen'>a</foo>"""
        soup = BeautifulSoup(text)
        self.assertEquals(soup.decode(), text)

        # &squot; below is Beautiful Soup's (historical, buggy-looking but
        # intentional) rendering of a single quote inside an attribute.
        soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
        newText = """<foo attr='Brawls happen at "Bob&squot;s Bar"'>a</foo>"""
        self.assertSoupEquals(soup.decode(), newText)

        # BUG FIX: the expected output's "&amp;" had been entity-decoded to a
        # raw "&" in transit; rendered output must escape the ampersand.
        self.assertSoupEquals('<this is="really messed up & stuff">',
                              '<this is="really messed up &amp; stuff"></this>')
+
+
+
class YoureSoLiteral(SoupTest):
    "Test literal mode."
    def testLiteralMode(self):
        # Inside <script> the "<" is not treated as markup.
        text = "<script>if (i<imgs.length)</script><b>Foo</b>"
        soup = BeautifulSoup(text)
        self.assertEqual(soup.script.contents[0], "if (i<imgs.length)")
        self.assertEqual(soup.b.contents[0], "Foo")

    def testTextArea(self):
        # <textarea> content is likewise literal: embedded tags and stray
        # "<&" sequences survive as plain text.
        text = "<textarea><b>This is an example of an HTML tag</b><&<&</textarea>"
        soup = BeautifulSoup(text)
        self.assertEqual(soup.textarea.contents[0],
                         "<b>This is an example of an HTML tag</b><&<&")
+
class OperatorOverload(SoupTest):
    "Our operators do it all! Call now!"

    def testTagNameAsFind(self):
        "Tests that referencing a tag name as a member delegates to find()."
        soup = BeautifulSoup('<b id="1">foo<i>bar</i></b><b>Red herring</b>')
        self.assertEqual(soup.b.i, soup.find('b').find('i'))
        self.assertEqual(soup.b.i.string, 'bar')
        self.assertEqual(soup.b['id'], '1')
        self.assertEqual(soup.b.contents[0], 'foo')
        # Attribute access for a tag that does not exist yields a falsy value.
        self.assert_(not soup.a)

        #Test the .fooTag variant of .foo.
        self.assertEqual(soup.bTag.iTag.string, 'bar')
        self.assertEqual(soup.b.iTag.string, 'bar')
        self.assertEqual(soup.find('b').find('i'), soup.bTag.iTag)
+
class NestableEgg(SoupTest):
    """Here we test tag nesting. TEST THE NEST, DUDE! X-TREME!"""

    def testParaInsideBlockquote(self):
        # Unclosed <p>/<blockquote> tags are closed at the right nesting level.
        soup = BeautifulSoup('<blockquote><p><b>Foo</blockquote><p>Bar')
        self.assertEqual(soup.blockquote.p.b.string, 'Foo')
        self.assertEqual(soup.blockquote.b.string, 'Foo')
        self.assertEqual(soup.find('p', recursive=False).string, 'Bar')

    def testNestedTables(self):
        text = """<table id="1"><tr><td>Here's another table:
 <table id="2"><tr><td>Juicy text</td></tr></table></td></tr></table>"""
        soup = BeautifulSoup(text)
        self.assertEquals(soup.table.table.td.string, 'Juicy text')
        self.assertEquals(len(soup.findAll('table')), 2)
        self.assertEquals(len(soup.table.findAll('table')), 1)
        self.assertEquals(soup.find('table', {'id' : 2}).parent.parent.parent.name,
                          'table')

        text = "<table><tr><td><div><table>Foo</table></div></td></tr></table>"
        soup = BeautifulSoup(text)
        self.assertEquals(soup.table.tr.td.div.table.contents[0], "Foo")

        # thead/tbody/tfoot must stay children of <table>, not nest oddly.
        text = """<table><thead><tr>Foo</tr></thead><tbody><tr>Bar</tr></tbody>
 <tfoot><tr>Baz</tr></tfoot></table>"""
        soup = BeautifulSoup(text)
        self.assertEquals(soup.table.thead.tr.contents[0], "Foo")

    def testBadNestedTables(self):
        # A <table> opened inside a <tr> starts a genuinely nested table.
        soup = BeautifulSoup("<table><tr><table><tr id='nested'>")
        self.assertEquals(soup.table.tr.table.tr['id'], 'nested')
+
class CleanupOnAisleFour(SoupTest):
    """Here we test cleanup of text that breaks HTMLParser or is just
    obnoxious."""
    # NOTE(review): many string literals in this class had their HTML entity
    # references decoded somewhere in transit (e.g. "&amp;" -> "&").  The
    # unambiguous cases are restored below, anchored on the surviving escaped
    # expectations (u"<\u2122'", "x\xc3\xb1", u"<a>\xa0\xa0</a>", and the
    # otherwise no-op replace('&','&') calls).

    def testSelfClosingtag(self):
        self.assertEqual(BeautifulSoup("Foo<br/>Bar").find('br').decode(),
                         '<br />')

        self.assertSoupEquals('<p>test1<br/>test2</p>',
                              '<p>test1<br />test2</p>')

        # Unknown tags are not self-closing unless declared as such.
        text = '<p>test1<selfclosing>test2'
        soup = BeautifulStoneSoup(text)
        self.assertEqual(soup.decode(),
                         '<p>test1<selfclosing>test2</selfclosing></p>')

        soup = BeautifulStoneSoup(text, selfClosingTags='selfclosing')
        self.assertEqual(soup.decode(),
                         '<p>test1<selfclosing />test2</p>')

    def testSelfClosingTagOrNot(self):
        # <link> is self-closing in HTML but an ordinary tag in XML.
        text = "<item><link>http://foo.com/</link></item>"
        self.assertEqual(BeautifulStoneSoup(text).decode(), text)
        self.assertEqual(BeautifulSoup(text).decode(),
                         '<item><link />http://foo.com/</item>')

    def testBooleanAttributes(self):
        text = "<td nowrap>foo</td>"
        self.assertSoupEquals(text, text)

    def testCData(self):
        xml = "<root>foo<![CDATA[foobar]]>bar</root>"
        self.assertSoupEquals(xml, xml)
        r = re.compile("foo.*bar")
        soup = BeautifulSoup(xml)
        self.assertEquals(soup.find(text=r).string, "foobar")
        self.assertEquals(soup.find(text=r).__class__, CData)

    def testComments(self):
        xml = "foo<!--foobar-->baz"
        self.assertSoupEquals(xml)
        r = re.compile("foo.*bar")
        soup = BeautifulSoup(xml)
        self.assertEquals(soup.find(text=r).string, "foobar")
        self.assertEquals(soup.find(text="foobar").__class__, Comment)

    def testDeclaration(self):
        xml = "foo<!DOCTYPE foobar>baz"
        self.assertSoupEquals(xml)
        r = re.compile(".*foo.*bar")
        soup = BeautifulSoup(xml)
        text = "DOCTYPE foobar"
        self.assertEquals(soup.find(text=r).string, text)
        self.assertEquals(soup.find(text=text).__class__, Declaration)

        namespaced_doctype = ('<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd">'
                              '<html>foo</html>')
        soup = BeautifulSoup(namespaced_doctype)
        self.assertEquals(soup.contents[0],
                          'DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd"')
        self.assertEquals(soup.html.contents[0], 'foo')

    def testEntityConversions(self):
        # BUG FIX: entity references restored (the htmlEnt expectation
        # u"<<sacr\xe9 bleu!>>" survived and anchors the reconstruction).
        text = "&lt;&lt;sacr&eacute; bleu!&gt;&gt;"
        soup = BeautifulStoneSoup(text)
        self.assertSoupEquals(text)

        xmlEnt = BeautifulStoneSoup.XML_ENTITIES
        htmlEnt = BeautifulStoneSoup.HTML_ENTITIES
        xhtmlEnt = BeautifulStoneSoup.XHTML_ENTITIES

        # XML conversion handles &lt;/&gt; but leaves HTML-only &eacute; alone.
        soup = BeautifulStoneSoup(text, convertEntities=xmlEnt)
        self.assertEquals(soup.decode(), "<<sacr&eacute; bleu!>>")

        soup = BeautifulStoneSoup(text, convertEntities=xmlEnt)
        self.assertEquals(soup.decode(), "<<sacr&eacute; bleu!>>")

        soup = BeautifulStoneSoup(text, convertEntities=htmlEnt)
        self.assertEquals(soup.decode(), u"<<sacr\xe9 bleu!>>")

        # Make sure the "XML", "HTML", and "XHTML" settings work.
        text = "&lt;&trade;&apos;"
        soup = BeautifulStoneSoup(text, convertEntities=xmlEnt)
        self.assertEquals(soup.decode(), u"<&trade;'")

        soup = BeautifulStoneSoup(text, convertEntities=htmlEnt)
        self.assertEquals(soup.decode(), u"<\u2122'")

        soup = BeautifulStoneSoup(text, convertEntities=xhtmlEnt)
        self.assertEquals(soup.decode(), u"<\u2122'")

    def testNonBreakingSpaces(self):
        # BUG FIX: input restored to &nbsp; entities, anchored on the
        # expected U+00A0 characters.
        soup = BeautifulSoup("<a>&nbsp;&nbsp;</a>",
                             convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
        self.assertEquals(soup.decode(), u"<a>\xa0\xa0</a>")

    def testWhitespaceInDeclaration(self):
        self.assertSoupEquals('<! DOCTYPE>', '<!DOCTYPE>')

    def testJunkInDeclaration(self):
        self.assertSoupEquals('<! Foo = -8>a', '<!Foo = -8>a')

    def testIncompleteDeclaration(self):
        self.assertSoupEquals('a<!b <p>c')

    def testEntityReplacement(self):
        # NOTE(review): presumably '&nbsp;' originally (the two words were
        # separated by a single decoded character) -- confirm upstream.
        self.assertSoupEquals('<b>hello&nbsp;there</b>')

    def testEntitiesInAttributeValues(self):
        # BUG FIX: these two assertions were byte-identical after entity
        # decoding; they originally exercised the decimal and hexadecimal
        # character-reference forms of U+00F1.
        self.assertSoupEquals('<x t="x&#241;">', '<x t="x\xc3\xb1"></x>',
                              encoding='utf-8')
        self.assertSoupEquals('<x t="x&#xf1;">', '<x t="x\xc3\xb1"></x>',
                              encoding='utf-8')

        # Restored from the expected u'<x t=">\u2122"></x>'.
        soup = BeautifulSoup('<x t="&gt;&trade;">',
                             convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
        self.assertEquals(soup.decode(), u'<x t=">\u2122"></x>')

        uri = "http://crummy.com?sacr&eacute;&amp;bleu"
        link = '<a href="%s"></a>' % uri

        soup = BeautifulSoup(link, convertEntities=BeautifulSoup.HTML_ENTITIES)
        self.assertEquals(soup.decode(),
                          link.replace("&eacute;", u"\xe9"))

        uri = "http://crummy.com?sacr&eacute;&bleu"
        link = '<a href="%s"></a>' % uri
        soup = BeautifulSoup(link, convertEntities=BeautifulSoup.HTML_ENTITIES)
        self.assertEquals(soup.a['href'],
                          uri.replace("&eacute;", u"\xe9"))

    def testNakedAmpersands(self):
        # BUG FIX: the expected values' "&amp;" had been decoded to "&",
        # turning replace('&','&') into a no-op and the assertions into
        # tautologies; naked ampersands must be escaped in rendered output.
        html = {'convertEntities':BeautifulStoneSoup.HTML_ENTITIES}
        soup = BeautifulStoneSoup("AT&T ", **html)
        self.assertEquals(soup.decode(), 'AT&amp;T ')

        nakedAmpersandInASentence = "AT&T was Ma Bell"
        soup = BeautifulStoneSoup(nakedAmpersandInASentence,**html)
        self.assertEquals(soup.decode(), \
            nakedAmpersandInASentence.replace('&','&amp;'))

        invalidURL = '<a href="http://example.org?a=1&b=2;3">foo</a>'
        validURL = invalidURL.replace('&','&amp;')
        soup = BeautifulStoneSoup(invalidURL)
        self.assertEquals(soup.decode(), validURL)

        soup = BeautifulStoneSoup(validURL)
        self.assertEquals(soup.decode(), validURL)
+
+
class EncodeRed(SoupTest):
    """Tests encoding conversion, Unicode conversion, and Microsoft
    smart quote fixes."""

    def testUnicodeDammitStandalone(self):
        markup = "<foo>\x92</foo>"
        dammit = UnicodeDammit(markup)
        # BUG FIX: the expected value's character reference had been decoded
        # to a literal smart quote in transit.  UnicodeDammit's default
        # smartQuotesTo='xml' renders \x92 as the XML reference &#x2019;.
        self.assertEquals(dammit.unicode, "<foo>&#x2019;</foo>")

        hebrew = "\xed\xe5\xec\xf9"
        dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
        self.assertEquals(dammit.unicode, u'\u05dd\u05d5\u05dc\u05e9')
        self.assertEquals(dammit.originalEncoding, 'iso-8859-8')

    def testGarbageInGarbageOut(self):
        ascii = "<foo>a</foo>"
        asciiSoup = BeautifulStoneSoup(ascii)
        self.assertEquals(ascii, asciiSoup.decode())

        unicodeData = u"<foo>\u00FC</foo>"
        utf8 = unicodeData.encode("utf-8")
        self.assertEquals(utf8, '<foo>\xc3\xbc</foo>')

        unicodeSoup = BeautifulStoneSoup(unicodeData)
        self.assertEquals(unicodeData, unicodeSoup.decode())
        self.assertEquals(unicodeSoup.foo.string, u'\u00FC')

        utf8Soup = BeautifulStoneSoup(utf8, fromEncoding='utf-8')
        self.assertEquals(utf8, utf8Soup.encode('utf-8'))
        self.assertEquals(utf8Soup.originalEncoding, "utf-8")

        # Unicode input has no original encoding to report.
        utf8Soup = BeautifulStoneSoup(unicodeData)
        self.assertEquals(utf8, utf8Soup.encode('utf-8'))
        self.assertEquals(utf8Soup.originalEncoding, None)


    def testHandleInvalidCodec(self):
        # Bogus fromEncoding values fall back to encoding detection.
        for bad_encoding in ['.utf8', '...', 'utF---16.!']:
            soup = BeautifulSoup(u"Räksmörgås".encode("utf-8"),
                                 fromEncoding=bad_encoding)
            self.assertEquals(soup.originalEncoding, 'utf-8')

    def testUnicodeSearch(self):
        html = u'<html><body><h1>Räksmörgås</h1></body></html>'
        soup = BeautifulSoup(html)
        self.assertEqual(soup.find(text=u'Räksmörgås'),u'Räksmörgås')

    def testRewrittenXMLHeader(self):
        euc_jp = '<?xml version="1.0 encoding="euc-jp"?>\n<foo>\n\xa4\xb3\xa4\xec\xa4\xcfEUC-JP\xa4\xc7\xa5\xb3\xa1\xbc\xa5\xc7\xa5\xa3\xa5\xf3\xa5\xb0\xa4\xb5\xa4\xec\xa4\xbf\xc6\xfc\xcb\xdc\xb8\xec\xa4\xce\xa5\xd5\xa5\xa1\xa5\xa4\xa5\xeb\xa4\xc7\xa4\xb9\xa1\xa3\n</foo>\n'
        utf8 = "<?xml version='1.0' encoding='utf-8'?>\n<foo>\n\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafEUC-JP\xe3\x81\xa7\xe3\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n</foo>\n"
        soup = BeautifulStoneSoup(euc_jp)
        if soup.originalEncoding != "euc-jp":
            raise Exception("Test failed when parsing euc-jp document. "
                            "If you're running Python >=2.4, or you have "
                            "cjkcodecs installed, this is a real problem. "
                            "Otherwise, ignore it.")

        self.assertEquals(soup.originalEncoding, "euc-jp")
        self.assertEquals(soup.renderContents('utf-8'), utf8)

        # NOTE(review): the smart quote in new_text appears to have been
        # entity-decoded in transit (originally a &#x2019;/&rsquo; reference)
        # -- confirm against upstream BeautifulSoupTests.py.
        old_text = "<?xml encoding='windows-1252'><foo>\x92</foo>"
        new_text = "<?xml version='1.0' encoding='utf-8'?><foo>’</foo>"
        self.assertSoupEquals(old_text, new_text)

    def testRewrittenMetaTag(self):
        no_shift_jis_html = '''<html><head>\n<meta http-equiv="Content-language" content="ja" /></head><body><pre>\n\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n</pre></body></html>'''
        soup = BeautifulSoup(no_shift_jis_html)

        # Beautiful Soup used to try to rewrite the meta tag even if the
        # meta tag got filtered out by the strainer. This test makes
        # sure that doesn't happen.
        strainer = SoupStrainer('pre')
        soup = BeautifulSoup(no_shift_jis_html, parseOnlyThese=strainer)
        self.assertEquals(soup.contents[0].name, 'pre')

        meta_tag = ('<meta content="text/html; charset=x-sjis" '
                    'http-equiv="Content-type" />')
        shift_jis_html = (
            '<html><head>\n%s\n'
            '<meta http-equiv="Content-language" content="ja" />'
            '</head><body><pre>\n'
            '\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
            '\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
            '\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n'
            '</pre></body></html>') % meta_tag
        soup = BeautifulSoup(shift_jis_html)
        if soup.originalEncoding != "shift-jis":
            raise Exception("Test failed when parsing shift-jis document "
                            "with meta tag '%s'."
                            "If you're running Python >=2.4, or you have "
                            "cjkcodecs installed, this is a real problem. "
                            "Otherwise, ignore it." % meta_tag)
        self.assertEquals(soup.originalEncoding, "shift-jis")

        # The declared charset is rewritten to match whatever encoding the
        # document is rendered in.
        content_type_tag = soup.meta['content']
        self.assertEquals(content_type_tag[content_type_tag.find('charset='):],
                          'charset=%SOUP-ENCODING%')
        content_type = str(soup.meta)
        index = content_type.find('charset=')
        self.assertEqual(content_type[index:index+len('charset=utf8')+1],
                         'charset=utf-8')
        content_type = soup.meta.encode('shift-jis')
        index = content_type.find('charset=')
        self.assertEqual(content_type[index:index+len('charset=shift-jis')],
                         'charset=shift-jis'.encode())

        self.assertEquals(soup.encode('utf-8'), (
            '<html><head>\n'
            '<meta content="text/html; charset=utf-8" '
            'http-equiv="Content-type" />\n'
            '<meta http-equiv="Content-language" content="ja" />'
            '</head><body><pre>\n'
            '\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafShift-JIS\xe3\x81\xa7\xe3'
            '\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3'
            '\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6'
            '\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3'
            '\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n'
            '</pre></body></html>'))
        self.assertEquals(soup.encode("shift-jis"),
                          shift_jis_html.replace('x-sjis'.encode(),
                                                 'shift-jis'.encode()))

        isolatin = """<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>"""
        soup = BeautifulSoup(isolatin)

        utf8 = isolatin.replace("ISO-Latin-1".encode(), "utf-8".encode())
        utf8 = utf8.replace("\xe9", "\xc3\xa9")
        self.assertSoupEquals(soup.encode("utf-8"), utf8, encoding='utf-8')

    def testHebrew(self):
        iso_8859_8= '<HEAD>\n<TITLE>Hebrew (ISO 8859-8) in Visual Directionality</TITLE>\n\n\n\n</HEAD>\n<BODY>\n<H1>Hebrew (ISO 8859-8) in Visual Directionality</H1>\n\xed\xe5\xec\xf9\n</BODY>\n'
        utf8 = '<head>\n<title>Hebrew (ISO 8859-8) in Visual Directionality</title>\n</head>\n<body>\n<h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\n\xd7\x9d\xd7\x95\xd7\x9c\xd7\xa9\n</body>\n'
        soup = BeautifulStoneSoup(iso_8859_8, fromEncoding="iso-8859-8")
        self.assertEquals(soup.encode('utf-8'), utf8)

    def testSmartQuotesNotSoSmartAnymore(self):
        # NOTE(review): the expected value's smart-quote entity references
        # (&#x2018;/&#x2019; or &lsquo;/&rsquo;) appear to have been decoded
        # to literal quotes in transit -- confirm against upstream.
        self.assertSoupEquals("\x91Foo\x92 <!--blah-->",
                              '‘Foo’ <!--blah-->')

    def testDontConvertSmartQuotesWhenAlsoConvertingEntities(self):
        # NOTE(review): same suspected entity decoding as above in both the
        # input (likely "Sacr&eacute;") and the first expected value.
        smartQuotes = "Il a dit, \x8BSacré bleu!\x9b"
        soup = BeautifulSoup(smartQuotes)
        self.assertEquals(soup.decode(),
                          'Il a dit, ‹Sacré bleu!›')
        soup = BeautifulSoup(smartQuotes, convertEntities="html")
        self.assertEquals(soup.encode('utf-8'),
                          'Il a dit, \xe2\x80\xb9Sacr\xc3\xa9 bleu!\xe2\x80\xba')

    def testDontSeeSmartQuotesWhereThereAreNone(self):
        utf_8 = "\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
        self.assertSoupEquals(utf_8, encoding='utf-8')
+
+
class Whitewash(SoupTest):
    """Test whitespace preservation."""

    def testPreservedWhitespace(self):
        # NOTE(review): runs of spaces inside these literals may have been
        # collapsed in transit (the upstream test uses multiple spaces inside
        # <pre>) -- confirm against upstream BeautifulSoupTests.py.
        self.assertSoupEquals("<pre> </pre>")
        self.assertSoupEquals("<pre> woo </pre>")

    def testCollapsedWhitespace(self):
        # Same caveat: the first argument likely held several spaces that
        # the parser collapses to one.
        self.assertSoupEquals("<p> </p>", "<p> </p>")
+
+
# Run the whole suite when executed directly as a script.
if __name__ == '__main__':
    unittest.main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/app/htmlsanitizer/HtmlSanitizer.py Tue May 26 02:37:39 2009 +0200
@@ -0,0 +1,575 @@
+# -*- coding: UTF-8 -*-
+"""
+some input filters, for regularising the html fragments from screen scraping and
+browser-based editors into some semblance of sanity
+
+TODO: turn the messy setting[method_name]=True filter syntax into a list of cleaning methods to invoke, so that they can be invoked in a specific order and multiple times.
+
+AUTHORS:
+Dan MacKinlay - https://launchpad.net/~dan-possumpalace
+Collin Grady - http://launchpad.net/~collin-collingrady
+Andreas Gustafsson - https://bugs.launchpad.net/~gson
+Håkan W - https://launchpad.net/~hwaara-gmail
+"""
+
+import BeautifulSoup
+import re
+import sys
+
# Python 2.4 compatibility: any() only joined the builtins in Python 2.5,
# so supply a pure-Python fallback when it is missing.
try:
    any
except NameError:
    def any(iterable):
        """Fallback any(): True when at least one element is truthy."""
        for element in iterable:
            if element:
                return True
        return False
+
+"""
+html5lib compatibility. Basically, we need to know that this still works whether html5lib
+is imported or not. Should run complete suites of tests for both possible configs -
+or test in virtual environments, but for now a basic sanity check will do.
+>>> if html5:
+...     c = Cleaner(html5=False)
+...     c(u'<p>foo</p>')
+u'<p>foo</p>'
+"""
# Optional html5lib support: when importable, build a parser that routes
# input through html5lib's sanitizing tokenizer into a BeautifulSoup tree.
# The `html5` flag records which mode is active and becomes the default
# for Cleaner's "html5" setting.
try:
    import html5lib
    from html5lib import sanitizer, treebuilders
    parser = html5lib.HTMLParser(
        tree=treebuilders.getTreeBuilder("beautifulsoup"),
        tokenizer=sanitizer.HTMLSanitizer
    )
    html5 = True
except ImportError:
    html5 = False
+
# Matches "javascript:" URI schemes even when obfuscated with interleaved
# whitespace (e.g. "j a v a script:").
ANTI_JS_RE=re.compile('j\s*a\s*v\s*a\s*s\s*c\s*r\s*i\s*p\s*t\s*:', re.IGNORECASE)
#These tags and attrs are sufficently liberal to let microformats through...
#it ruthlessly culls all the rdf, dublin core metadata and so on.
valid_tags = dict.fromkeys('p i em strong b u a h1 h2 h3 pre abbr br img dd dt ol ul li span sub sup ins del blockquote table tr td th address cite'.split()) #div?
valid_attrs = dict.fromkeys('href src rel title'.split())
valid_schemes = dict.fromkeys('http https'.split())
# Presentational -> semantic renamings applied by Cleaner.rename_tags.
elem_map = {'b' : 'strong', 'i': 'em'}
attrs_considered_links = dict.fromkeys("src href".split()) #should include
#courtesy http://developer.mozilla.org/en/docs/HTML:Block-level_elements
block_elements = dict.fromkeys(["p", "h1","h2", "h3", "h4", "h5", "h6", "ol", "ul", "pre", "address", "blockquote", "dl", "div", "fieldset", "form", "hr", "noscript", "table"])

#convenient default filter lists.
# NOTE(review): "strip_empty_tags" appears twice in paranoid_filters --
# possibly deliberate (a second pass catches parents emptied by the first),
# possibly a copy/paste slip; running it twice is harmless either way.
paranoid_filters = ["strip_comments", "strip_tags", "strip_attrs",
    "strip_schemes", "rename_tags", "wrap_string", "strip_empty_tags", "strip_empty_tags", ]
complete_filters = ["strip_comments", "rename_tags", "strip_tags", "strip_attrs",
    "strip_cdata", "strip_schemes", "wrap_string", "strip_empty_tags", "rebase_links", "reparse"]
+
#set some conservative default string processings
# Each key can be overridden per-instance via Cleaner(**kwargs).
default_settings = {
    "filters" : paranoid_filters,
    "block_elements" : block_elements, #xml or None for a more liberal version
    "convert_entities" : "html", #xml or None for a more liberal version
    "valid_tags" : valid_tags,
    "valid_attrs" : valid_attrs,
    "valid_schemes" : valid_schemes,
    "attrs_considered_links" : attrs_considered_links,
    "elem_map" : elem_map,
    "wrapping_element" : "p",
    "auto_clean" : False,          # clean() immediately on string assignment
    "original_url" : "",           # used by the (unimplemented) rebase_links
    "new_url" : "",
    "html5" : html5                # parse via html5lib when it was importable
}
#processes I'd like but haven't implemented
#"encode_xml_specials", "ensure complete xhtml doc", "ensure_xhtml_fragment_only"
# and some handling of permitted namespaces for tags. for RDF, say. maybe.

# Map of XML special characters to the entity references that replace them
# when escaping text.  BUG FIX: the right-hand sides had been entity-decoded
# in transit into an identity mapping (u"'" : u"'" and so on), which would
# make any escaping pass a silent no-op; restored to the standard five
# predefined XML entities.
XML_ENTITIES = { u"'" : u"&apos;",
                 u'"' : u"&quot;",
                 u"&" : u"&amp;",
                 u"<" : u"&lt;",
                 u">" : u"&gt;"
                 }
# One match per (non-empty) line of text.
LINE_EXTRACTION_RE = re.compile(".+", re.MULTILINE)
# Matches <br>, </br>, <br/> and <br /> variants.
BR_EXTRACTION_RE = re.compile("</?br ?/?>", re.MULTILINE)
+
class Stop:
    """
    handy class that we use as a stop input for our state machine in lieu of falling
    off the end of lists
    """
    # Instances carry no state; Cleaner.wrap_string() appends one to a child
    # list so its loop sees an explicit terminator.
    pass
+
+
+class Cleaner(object):
+ r"""
+ powerful and slow arbitrary HTML sanitisation. can deal (i hope) with most XSS
+ vectors and layout-breaking badness.
+ Probably overkill for content from trusted sources; defaults are accordingly
+ set to be paranoid.
+ >>> bad_html = '<p style="forbidden markup"><!-- XSS attach -->content</p'
+ >>> good_html = u'<p>content</p>'
+ >>> c = Cleaner()
+ >>> c.string = bad_html
+ >>> c.clean()
+ >>> c.string == good_html
+ True
+
+ Also supports shorthand syntax:
+ >>> c = Cleaner()
+ >>> c(bad_html) == c(good_html)
+ True
+ """
+
+ def __init__(self, string_or_soup="", *args, **kwargs):
+ self.settings=default_settings.copy()
+ self.settings.update(kwargs)
+ if args :
+ self.settings['filters'] = args
+ super(Cleaner, self).__init__(string_or_soup, *args, **kwargs)
+ self.string = string_or_soup
+
    def __call__(self, string = None, **kwargs):
        """
        convenience method allowing one-step calling of an instance and returning
        a cleaned string.

        TODO: make this method preserve internal state- perhaps by creating a new
        instance.

        >>> s = 'input string'
        >>> c1 = Cleaner(s, auto_clean=True)
        >>> c2 = Cleaner("")
        >>> c1.string == c2(s)
        True

        """
        # Per-call keyword overrides are folded into the persistent settings.
        self.settings.update(kwargs)
        if not string == None :
            self.string = string
        self.clean()
        return self.string
+
    def _set_contents(self, string_or_soup):
        # Accept either an already-parsed soup or raw markup text.
        if isinstance(string_or_soup, BeautifulSoup.BeautifulSoup) :
            self._set_soup(string_or_soup)
        else :
            self._set_string(string_or_soup)
+
    def _set_string(self, html_fragment_string):
        # Parse via html5lib's sanitizing parser when enabled, otherwise
        # plain BeautifulSoup with the configured entity conversion.
        if self.settings['html5']:
            s = parser.parse(html_fragment_string).body
        else:
            s = BeautifulSoup.BeautifulSoup(
                html_fragment_string,
                convertEntities=self.settings['convert_entities'])
        self._set_soup(s)
+
    def _set_soup(self, soup):
        """
        Does all the work of set_string, but bypasses a potential autoclean to avoid
        loops upon internal string setting ops.
        """
        # Re-root the parsed content under an artificial <rootrootroot>
        # element; see set_string's docstring for why a plain soup object
        # cannot serve as the root.
        self._soup = BeautifulSoup.BeautifulSoup(
            '<rootrootroot></rootrootroot>'
        )
        self.root=self._soup.contents[0]

        if len(soup.contents) :
            backwards_soup = [i for i in soup.contents]
            backwards_soup.reverse()
        else :
            backwards_soup = []
        # Detach each child and re-insert at position 0; iterating the
        # reversed copy preserves the original document order.
        for i in backwards_soup :
            i.extract()
            self.root.insert(0, i)
+
    def set_string(self, string) :
        ur"""
        sets the string to process and does the necessary input encoding too
        really intended to be invoked as a property.
        note the godawful rootrootroot element which we need because the
        BeautifulSoup object has all the same methods as a Tag, but
        behaves differently, silently failing on some inserts and appends

        >>> c = Cleaner(convert_entities="html")
        >>> c.string = 'é'
        >>> c.string
        u'\xe9'
        >>> c = Cleaner(convert_entities="xml")
        >>> c.string = u'é'
        >>> c.string
        u'é'
        """
        self._set_string(string)
        # Optionally clean immediately; empty input is left untouched.
        if len(string) and self.settings['auto_clean'] : self.clean()
+
    def get_string(self):
        # Render the children of the artificial root: the cleaned fragment.
        return unicode(self.root.renderContents())

    # Public API: assigning .string parses (and may clean); reading re-renders.
    string = property(get_string, set_string)
+
    def clean(self):
        """
        invoke all cleaning processes stipulated in the settings
        """
        # Filter names are looked up as methods on self, in list order;
        # placeholder filters (e.g. rebase_links) raise NotImplementedError
        # and are reported on stderr rather than aborting the run.
        for method in self.settings['filters'] :
            try :
                getattr(self, method)()
            except NotImplementedError :
                sys.stderr.write('Warning, called unimplemented method %s' % method + '\n')
+
    def strip_comments(self):
        r"""
        XHTML comments are used as an XSS attack vector. they must die.

        >>> c = Cleaner("", "strip_comments")
        >>> c('<p>text<!-- comment --> More text</p>')
        u'<p>text More text</p>'
        """
        # Comments are NavigableString subclasses, so findAll(text=...) with
        # an isinstance predicate locates them all.
        for comment in self.root.findAll(
            text = lambda text: isinstance(text, BeautifulSoup.Comment)):
            comment.extract()
+
    def strip_cdata(self):
        # Drop CDATA sections outright, using the same predicate technique
        # as strip_comments.
        for cdata in self.root.findAll(
            text = lambda text: isinstance(text, BeautifulSoup.CData)):
            cdata.extract()
+
    def strip_tags(self):
        r"""
        ill-considered tags break our layout. they must die.
        >>> c = Cleaner("", "strip_tags", auto_clean=True)
        >>> c.string = '<div>A <strong>B C</strong></div>'
        >>> c.string
        u'A <strong>B C</strong>'
        >>> c.string = '<div>A <div>B C</div></div>'
        >>> c.string
        u'A B C'
        >>> c.string = '<div>A <br /><div>B C</div></div>'
        >>> c.string
        u'A <br />B C'
        >>> c.string = '<p>A <div>B C</div></p>'
        >>> c.string
        u'<p>A B C</p>'
        >>> c.string = 'A<div>B<div>C<div>D</div>E</div>F</div>G'
        >>> c.string
        u'ABCDEFG'
        >>> c.string = '<div>B<div>C<div>D</div>E</div>F</div>'
        >>> c.string
        u'BCDEF'
        """
        # Beautiful Soup doesn't support dynamic .findAll results when the tree is
        # modified in place.
        # going backwards doesn't seem to help.
        # so find one at a time
        while True :
            next_bad_tag = self.root.find(
                lambda tag : not tag.name in (self.settings['valid_tags'])
            )
            if next_bad_tag :
                # disgorge_elem (defined later in this module) replaces the
                # tag with its own children, per the doctests above.
                self.disgorge_elem(next_bad_tag)
            else:
                break
+
    def strip_attrs(self):
        """
        preserve only those attributes we need in the soup
        >>> c = Cleaner("", "strip_attrs")
        >>> c('<div title="v" bad="v">A <strong title="v" bad="v">B C</strong></div>')
        u'<div title="v">A <strong title="v">B C</strong></div>'
        """
        # findAll(True) visits every tag; rebuilding .attrs filters in place.
        for tag in self.root.findAll(True):
            tag.attrs = [(attr, val) for attr, val in tag.attrs
                         if attr in self.settings['valid_attrs']]
+
+ def _all_links(self):
+ """
+ finds all tags with link attributes sequentially. safe against modification
+ of said attributes in-place.
+ """
+ start = self.root
+ while True:
+ tag = start.findNext(
+ lambda tag : any(
+ [(tag.get(i) for i in self.settings['attrs_considered_links'])]
+ ))
+ if tag:
+ start = tag
+ yield tag
+ else :
+ break
+
    def strip_schemes(self):
        """
        >>> c = Cleaner("", "strip_schemes")
        >>> c('<img src="javascript:alert();" />')
        u'<img />'
        >>> c('<a href="javascript:alert();">foo</a>')
        u'<a>foo</a>'
        """
        for tag in self._all_links() :
            for key in self.settings['attrs_considered_links'] :
                # Split off the scheme; a single element means no ":" at all.
                scheme_bits = tag.get(key, u"").split(u':',1)
                if len(scheme_bits) == 1 :
                    pass #relative link
                else:
                    # Whitelist approach: drop the attribute unless the
                    # scheme is explicitly allowed.
                    if not scheme_bits[0] in self.settings['valid_schemes'] :
                        del(tag[key])
+
    def br_to_p(self):
        """
        >>> c = Cleaner("", "br_to_p")
        >>> c('<p>A<br />B</p>')
        u'<p>A</p><p>B</p>'
        >>> c('A<br />B')
        u'<p>A</p><p>B</p>'
        """
        # Treat <br> and <p> as block elements for the wrapping pass below.
        # NB: this mutates the shared settings dict entry in place.
        block_elems = self.settings['block_elements']
        block_elems['br'] = None
        block_elems['p'] = None

        while True :
            next_br = self.root.find('br')
            if not next_br: break
            parent = next_br.parent
            # Wrap the br's siblings into paragraphs, then discard the
            # now-redundant <br> separators themselves.
            self.wrap_string('p', start_at=parent, block_elems = block_elems)
            while True:
                useless_br=parent.find('br', recursive=False)
                if not useless_br: break
                useless_br.extract()
            # If the parent was itself a <p>, splice its children upward to
            # avoid nested paragraphs.
            if parent.name == 'p':
                self.disgorge_elem(parent)
+
    def rename_tags(self):
        """
        >>> c = Cleaner("", "rename_tags", elem_map={'i': 'em'})
        >>> c('<b>A<i>B</i></b>')
        u'<b>A<em>B</em></b>'
        """
        # findAll accepts the dict's keys as a name filter; each match is
        # renamed to its mapped replacement.
        for tag in self.root.findAll(self.settings['elem_map']) :
            tag.name = self.settings['elem_map'][tag.name]
+
    def wrap_string(self, wrapping_element = None, start_at=None, block_elems=None):
        """
        takes an html fragment, which may or may not have a single containing element,
        and guarantees what the tag name of the topmost elements are.
        TODO: is there some simpler way than a state machine to do this simple thing?
        >>> c = Cleaner("", "wrap_string")
        >>> c('A <strong>B C</strong>D')
        u'<p>A <strong>B C</strong>D</p>'
        >>> c('A <p>B C</p>D')
        u'<p>A </p><p>B C</p><p>D</p>'
        """
        # NB: this method uses the Python 2-only "<>" inequality operator.
        if not start_at : start_at = self.root
        if not block_elems : block_elems = self.settings['block_elements']
        e = (wrapping_element or self.settings['wrapping_element'])
        paragraph_list = []
        # Snapshot the children; appending a Stop sentinel gives the state
        # machine an explicit end-of-input event.
        children = [elem for elem in start_at.contents]
        children.append(Stop())

        last_state = 'block'
        paragraph = BeautifulSoup.Tag(self._soup, e)

        # States: 'inline' nodes are collected into the current wrapper tag;
        # 'block' nodes pass through untouched; 'end' flushes the last run.
        for node in children :
            if isinstance(node, Stop) :
                state = 'end'
            elif hasattr(node, 'name') and node.name in block_elems:
                state = 'block'
            else:
                state = 'inline'

            if last_state == 'block' and state == 'inline':
                #collate inline elements
                paragraph = BeautifulSoup.Tag(self._soup, e)

            if state == 'inline' :
                paragraph.append(node)

            # Leaving an inline run (into block or end) closes the wrapper.
            if ((state <> 'inline') and last_state == 'inline') :
                paragraph_list.append(paragraph)

            if state == 'block' :
                paragraph_list.append(node)

            last_state = state

        #can't use append since it doesn't work on empty elements...
        paragraph_list.reverse()
        for paragraph in paragraph_list:
            start_at.insert(0, paragraph)
+
def strip_empty_tags(self):
    """
    strip out all empty tags
    TODO: depth-first search
    >>> c = Cleaner("", "strip_empty_tags")
    >>> c('<p>A</p><p></p><p>B</p><p></p>')
    u'<p>A</p><p>B</p>'
    >>> c('<p><a></a></p>')
    u'<p></p>'
    """
    current = self.root
    while True:
        candidate = current.findNext(True)
        if not candidate:
            break
        if candidate.contents or candidate.attrs:
            # non-empty: keep it and continue the walk from here
            current = candidate
        else:
            # empty: remove it; the next findNext restarts from 'current'
            candidate.extract()
+
def rebase_links(self, original_url="", new_url=""):
    """Rewrite links from original_url to new_url (not yet implemented)."""
    # fall back to the configured URLs when none are passed explicitly
    original_url = original_url or self.settings.get('original_url', '')
    new_url = new_url or self.settings.get('new_url', '')
    raise NotImplementedError
+
+ # Because of its internal character set handling,
+ # the following will not work in Beautiful soup and is hopefully redundant.
+ # def encode_xml_specials(self, original_url="", new_url ="") :
+ # """
+ # BeautifulSoup will let some dangerous xml entities hang around
+ # in the navigable strings. destroy all monsters.
+ # >>> c = Cleaner(auto_clean=True, encode_xml_specials=True)
+ # >>> c('<<<<<')
+ # u'<<<<'
+ # """
+ # for string in self.root.findAll(text=True) :
+ # sys.stderr.write("root" +"\n")
+ # sys.stderr.write(str(self.root) +"\n")
+ # sys.stderr.write("parent" +"\n")
+ # sys.stderr.write(str(string.parent) +"\n")
+ # new_string = unicode(string)
+ # sys.stderr.write(string +"\n")
+ # for special_char in XML_ENTITIES.keys() :
+ # sys.stderr.write(special_char +"\n")
+ # string.replaceWith(
+ # new_string.replace(special_char, XML_ENTITIES[special_char])
+ # )
+
+
def disgorge_elem(self, elem):
    """
    remove the given element from the soup and replaces it with its own contents
    actually tricky, since you can't replace an element with an list of elements
    using replaceWith
    >>> disgorgeable_string = '<body>A <em>B</em> C</body>'
    >>> c = Cleaner()
    >>> c.string = disgorgeable_string
    >>> elem = c._soup.find('em')
    >>> c.disgorge_elem(elem)
    >>> c.string
    u'<body>A B C</body>'
    >>> c.string = disgorgeable_string
    >>> elem = c._soup.find('body')
    >>> c.disgorge_elem(elem)
    >>> c.string
    u'A <em>B</em> C'
    >>> c.string = '<div>A <div id="inner">B C</div></div>'
    >>> elem = c._soup.find(id="inner")
    >>> c.disgorge_elem(elem)
    >>> c.string
    u'<div>A B C</div>'
    """
    if elem == self.root:
        # FIX: parenthesized raise form; 'raise X, msg' is Python-2-only
        raise AttributeError("Can't disgorge root")

    # With in-place modification, BeautifulSoup occasionally can return
    # elements that think they are orphans; this lib is full of
    # workarounds, but it's worth checking
    parent = elem.parent
    if parent is None:
        raise AttributeError("AAAAAAAAGH! NO PARENTS! DEATH!")

    # FIX: the original left 'index' undefined (a NameError later on) if
    # elem was somehow absent from parent.contents; fail loudly instead.
    index = None
    for position, child in enumerate(parent.contents):
        if child == elem:
            index = position
            break
    if index is None:
        raise AttributeError("elem not found in its parent's contents")

    elem.extract()
    # splice the orphaned children back in where the element used to be
    self._safe_inject(parent, index, elem.contents)
+
def _safe_inject(self, dest, dest_index, node_list):
    """Insert every node of node_list into dest at dest_index, in order."""
    # BeautifulSoup result sets look like lists but don't behave right:
    # an empty one is still truthy, so test len() explicitly.
    if not len(node_list):
        return
    # insert back-to-front so the nodes end up in their original order
    for node in reversed(list(node_list)):
        dest.insert(dest_index, node)
+
+
class Htmlator(object) :
    """
    converts a string into a series of html paragraphs
    """
    # Class-level defaults; copied per instance in __init__, never mutated.
    settings = {
        "encode_xml_specials" : True,
        "is_plaintext" : True,
        "convert_newlines" : False,
        "make_links" : True,
        "auto_convert" : False,
        "valid_schemes" : valid_schemes,
        }

    def __init__(self, string="", **kwargs):
        # BUG FIX: the original updated the class-level settings dict,
        # leaking one instance's options into every other instance.
        self.settings = dict(self.settings)
        self.settings.update(kwargs)
        # BUG FIX: the original called object.__init__ with arguments,
        # which raises TypeError; object's initializer takes none.
        super(Htmlator, self).__init__()
        self.string = string

    def _set_string(self, string):
        # BUG FIX: the original assigned self.string here, which re-enters
        # this very property setter and recurses forever.  Store on a
        # private attribute instead.
        self._string = string
        if self.settings['auto_convert']:
            self.convert()

    def _get_string(self):
        return unicode(self._string)

    string = property(_get_string, _set_string)

    def __call__(self, string):
        """
        convenience method supporting one-step calling of an instance
        as a string cleaning function
        """
        self.string = string
        self.convert()
        return self.string

    def convert(self):
        """Run each enabled conversion pass, in a fixed order."""
        for method in ["encode_xml_specials", "convert_newlines",
                       "make_links"]:
            # BUG FIX: settings is a dict -- it must be indexed, not
            # called (self.settings(method) raised TypeError).
            if self.settings[method]:
                getattr(self, method)()

    def encode_xml_specials(self):
        # BUG FIX: str.replace returns a new string; the original threw
        # the result away.  Apply every replacement and store the result.
        result = self._string
        for char, entity in XML_ENTITIES.items():
            result = result.replace(char, entity)
        self._string = result

    def make_links(self):
        raise NotImplementedError

    def convert_newlines(self):
        self._string = ''.join(
            '<p>' + line + '</p>'
            for line in LINE_EXTRACTION_RE.findall(self._string))
+
def _test():
    """Execute this module's embedded doctests."""
    import doctest
    doctest.testmod()


if __name__ == "__main__":
    _test()
+
+
+# def cast_input_to_soup(fn):
+# """
+# Decorate function to handle strings as BeautifulSoups transparently
+# """
+# def stringy_version(input, *args, **kwargs) :
+# if not isinstance(input,BeautifulSoup) :
+# input=BeautifulSoup(input)
+# return fn(input, *args, **kwargs)
+# return stringy_version
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/app/htmlsanitizer/LICENSE-BeautifulSoup Tue May 26 02:37:39 2009 +0200
@@ -0,0 +1,32 @@
+Copyright (c) 2004-2009, Leonard Richardson
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following
+ disclaimer in the documentation and/or other materials provided
+ with the distribution.
+
+ * Neither the name of the the Beautiful Soup Consortium and All
+ Night Kosher Bakery nor the names of its contributors may be
+ used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/app/htmlsanitizer/LICENSE-HtmlSanitizer Tue May 26 02:37:39 2009 +0200
@@ -0,0 +1,23 @@
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the distribution.
+
+ 3. The names of the authors may not be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES,
+INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JCRAFT,
+INC. OR ANY CONTRIBUTORS TO THIS SOFTWARE BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/app/main.py Mon May 25 23:42:15 2009 +0200
+++ b/app/main.py Tue May 26 02:37:39 2009 +0200
@@ -29,42 +29,7 @@
from google.appengine.ext.webapp import util
-
-# Remove the standard version of Django.
-for k in [k for k in sys.modules if k.startswith('django')]:
- del sys.modules[k]
-
-# Force sys.path to have our own directory first, in case we want to import
-# from it. This lets us replace the built-in Django
-sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)))
-
-sys.path.insert(0, os.path.abspath('django.zip'))
-
-ultimate_sys_path = None
-
-# Force Django to reload its settings.
-from django.conf import settings
-settings._target = None
-
-# Must set this env var before importing any part of Django
-os.environ['DJANGO_SETTINGS_MODULE'] = 'settings'
-
-import django.core.handlers.wsgi
-import django.core.signals
-import django.db
-
-# Log errors.
-def log_exception(*args, **kwds):
- """Function used for logging exceptions.
- """
- logging.exception('Exception in request:')
-
-# Log all exceptions detected by Django.
-django.core.signals.got_request_exception.connect(log_exception)
-
-# Unregister the rollback event handler.
-django.core.signals.got_request_exception.disconnect(
- django.db._rollback_on_exception)
+import gae_django
def profile_main_as_html():
@@ -117,15 +82,17 @@
def real_main():
"""Main program without profiling.
"""
- global ultimate_sys_path
- if ultimate_sys_path is None:
- ultimate_sys_path = list(sys.path)
- else:
- sys.path[:] = ultimate_sys_path
+ import django.core.handlers.wsgi
# Create a Django application for WSGI.
application = django.core.handlers.wsgi.WSGIHandler()
+ from soc.modules import callback
+ from soc.modules import core
+
+ callback.registerCore(core.Core())
+ callback.getCore().registerModuleCallbacks()
+
# Run the WSGI CGI handler with that application.
util.run_wsgi_app(application)
--- a/app/settings.py Mon May 25 23:42:15 2009 +0200
+++ b/app/settings.py Tue May 26 02:37:39 2009 +0200
@@ -100,6 +100,7 @@
os.path.join(ROOT_PATH, 'ghop', 'templates'),
os.path.join(ROOT_PATH, 'gsoc', 'templates'),
os.path.join(ROOT_PATH, 'soc', 'templates'),
+ os.path.join(ROOT_PATH, 'shell', 'templates'),
)
INSTALLED_APPS = (
@@ -109,3 +110,6 @@
# 'django.contrib.sessions',
# 'django.contrib.sites',
)
+
+MODULE_FMT = 'soc.modules.%s'
+MODULES = []
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/app/shell/README Tue May 26 02:37:39 2009 +0200
@@ -0,0 +1,17 @@
+An interactive, stateful AJAX shell that runs Python code on the server.
+
+Part of http://code.google.com/p/google-app-engine-samples/.
+
+May be run as a standalone app or in an existing app as an admin-only handler.
+Can be used for system administration tasks, as an interactive way to try out
+APIs, or as a debugging aid during development.
+
+The logging, os, sys, db, and users modules are imported automatically.
+
+Interpreter state is stored in the datastore so that variables, function
+definitions, and other values in the global and local namespaces can be used
+across commands.
+
+To use the shell in your app, copy shell.py, static/*, and templates/* into
+your app's source directory. Then, copy the URL handlers from app.yaml into
+your app.yaml.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/app/shell/shell.py Tue May 26 02:37:39 2009 +0200
@@ -0,0 +1,317 @@
+#!/usr/bin/python
+#
+# Copyright 2007 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+An interactive, stateful AJAX shell that runs Python code on the server.
+
+Part of http://code.google.com/p/google-app-engine-samples/.
+
+May be run as a standalone app or in an existing app as an admin-only handler.
+Can be used for system administration tasks, as an interactive way to try out
+APIs, or as a debugging aid during development.
+
+The logging, os, sys, db, and users modules are imported automatically.
+
+Interpreter state is stored in the datastore so that variables, function
+definitions, and other values in the global and local namespaces can be used
+across commands.
+
+To use the shell in your app, copy shell.py, static/*, and templates/* into
+your app's source directory. Then, copy the URL handlers from app.yaml into
+your app.yaml.
+
+TODO: unit tests!
+"""
+
+import logging
+import new
+import os
+import pickle
+import sys
+import traceback
+import types
+import wsgiref.handlers
+
+from django.template import loader
+from google.appengine.api import users
+from google.appengine.ext import db
+from google.appengine.ext import webapp
+from google.appengine.ext.webapp import template
+
+import django.template
+import gae_django
+
+
# Set to True if stack traces should be shown in the browser, etc.
_DEBUG = True

# The entity kind for shell sessions. Feel free to rename to suit your app.
_SESSION_KIND = '_Shell_Session'

# Types that can't be pickled.  Globals of these types are re-created by
# replaying their source statement (see ShellSession) instead of being
# pickled into the datastore.
UNPICKLABLE_TYPES = (
  types.ModuleType,
  types.TypeType,
  types.ClassType,
  types.FunctionType,
  )

# Unpicklable statements to seed new sessions with.
INITIAL_UNPICKLABLES = [
  'import logging',
  'import os',
  'import sys',
  'from google.appengine.ext import db',
  'from google.appengine.api import users',
  ]
+
+
class ShellSession(db.Model):
  """A shell session. Stores the session's globals.

  Each session globals is stored in one of two places:

  If the global is picklable, it's stored in the parallel globals and
  global_names list properties. (They're parallel lists to work around the
  unfortunate fact that the datastore can't store dictionaries natively.)

  If the global is not picklable (e.g. modules, classes, and functions), or if
  it was created by the same statement that created an unpicklable global,
  it's not stored directly. Instead, the statement is stored in the
  unpicklables list property. On each request, before executing the current
  statement, the unpicklable statements are evaluated to recreate the
  unpicklable globals.

  The unpicklable_names property stores all of the names of globals that were
  added by unpicklable statements. When we pickle and store the globals after
  executing a statement, we skip the ones in unpicklable_names.

  Using Text instead of string is an optimization. We don't query on any of
  these properties, so they don't need to be indexed.
  """
  # Parallel lists: global_names[i] names the pickled value in globals[i].
  global_names = db.ListProperty(db.Text)
  globals = db.ListProperty(db.Blob)
  # Names created by unpicklable statements; skipped when pickling globals.
  unpicklable_names = db.ListProperty(db.Text)
  # Statements re-executed on each request to recreate unpicklable globals.
  unpicklables = db.ListProperty(db.Text)

  def set_global(self, name, value):
    """Adds a global, or updates it if it already exists.

    Also removes the global from the list of unpicklable names.

    Args:
      name: the name of the global to set
      value: any picklable value
    """
    blob = db.Blob(pickle.dumps(value))

    if name in self.global_names:
      # overwrite the existing pickled value in place
      index = self.global_names.index(name)
      self.globals[index] = blob
    else:
      self.global_names.append(db.Text(name))
      self.globals.append(blob)

    self.remove_unpicklable_name(name)

  def remove_global(self, name):
    """Removes a global, if it exists.

    Args:
      name: string, the name of the global to remove
    """
    if name in self.global_names:
      # delete from both parallel lists so they stay in sync
      index = self.global_names.index(name)
      del self.global_names[index]
      del self.globals[index]

  def globals_dict(self):
    """Returns a dictionary view of the globals.

    Each stored blob is unpickled back into a live Python value.
    """
    return dict((name, pickle.loads(val))
                for name, val in zip(self.global_names, self.globals))

  def add_unpicklable(self, statement, names):
    """Adds a statement and list of names to the unpicklables.

    Also removes the names from the globals.

    Args:
      statement: string, the statement that created new unpicklable global(s).
      names: list of strings; the names of the globals created by the statement.
    """
    self.unpicklables.append(db.Text(statement))

    for name in names:
      self.remove_global(name)
      if name not in self.unpicklable_names:
        self.unpicklable_names.append(db.Text(name))

  def remove_unpicklable_name(self, name):
    """Removes a name from the list of unpicklable names, if it exists.

    Args:
      name: string, the name of the unpicklable global to remove
    """
    if name in self.unpicklable_names:
      self.unpicklable_names.remove(name)
+
+
class FrontPageHandler(webapp.RequestHandler):
  """Creates a new session and renders the shell.html template.
  """

  def get(self):
    """Renders the shell UI, creating a fresh session if none was given."""
    # set up the session. TODO: garbage collect old shell sessions
    session_key = self.request.get('session')
    if session_key:
      session = ShellSession.get(session_key)
    else:
      # create a new session
      session = ShellSession()
      session.unpicklables = [db.Text(line) for line in INITIAL_UNPICKLABLES]
      session_key = session.put()

    session_url = '/?session=%s' % session_key
    # FIX: renamed from 'vars' (which shadowed the vars() builtin) and
    # dropped the unused template_file local left over from the
    # webapp.template rendering path.
    context = {'server_software': os.environ['SERVER_SOFTWARE'],
               'python_version': sys.version,
               'session': str(session_key),
               'user': users.get_current_user(),
               'login_url': users.create_login_url(session_url),
               'logout_url': users.create_logout_url(session_url),
               }

    rendered = loader.render_to_string('shell.html', dictionary=context)
    self.response.out.write(rendered)
+
+
class StatementHandler(webapp.RequestHandler):
  """Evaluates a python statement in a given session and returns the result.

  SECURITY NOTE: this executes arbitrary Python from the request.  It is
  only safe because the /admin/shell.* URLs are restricted to app admins
  (login: admin in app.yaml).
  """

  def get(self):
    """Compiles and runs the 'statement' parameter inside the session."""
    self.response.headers['Content-Type'] = 'text/plain'

    # extract the statement to be run
    statement = self.request.get('statement')
    if not statement:
      return

    # the python compiler doesn't like network line endings
    statement = statement.replace('\r\n', '\n')

    # add a couple newlines at the end of the statement. this makes
    # single-line expressions such as 'class Foo: pass' evaluate happily.
    statement += '\n\n'

    # log and compile the statement up front
    try:
      logging.info('Compiling and evaluating:\n%s' % statement)
      compiled = compile(statement, '<string>', 'single')
    except:
      # report compile errors (SyntaxError etc.) back to the shell UI
      self.response.out.write(traceback.format_exc())
      return

    # create a dedicated module to be used as this statement's __main__
    statement_module = new.module('__main__')

    # use this request's __builtin__, since it changes on each request.
    # this is needed for import statements, among other things.
    import __builtin__
    statement_module.__builtins__ = __builtin__

    # load the session from the datastore
    session = ShellSession.get(self.request.get('session'))

    # swap in our custom module for __main__. then unpickle the session
    # globals, run the statement, and re-pickle the session globals, all
    # inside it.
    old_main = sys.modules.get('__main__')
    try:
      sys.modules['__main__'] = statement_module
      statement_module.__name__ = '__main__'

      # re-evaluate the unpicklables
      for code in session.unpicklables:
        exec code in statement_module.__dict__

      # re-initialize the globals
      for name, val in session.globals_dict().items():
        try:
          statement_module.__dict__[name] = val
        except:
          msg = 'Dropping %s since it could not be unpickled.\n' % name
          self.response.out.write(msg)
          logging.warning(msg + traceback.format_exc())
          session.remove_global(name)

      # run!  snapshot the globals first so we can diff afterwards
      old_globals = dict(statement_module.__dict__)
      try:
        # redirect stdout/stderr into the HTTP response for the duration
        # of the exec, restoring them even if the statement raises
        old_stdout = sys.stdout
        old_stderr = sys.stderr
        try:
          sys.stdout = self.response.out
          sys.stderr = self.response.out
          exec compiled in statement_module.__dict__
        finally:
          sys.stdout = old_stdout
          sys.stderr = old_stderr
      except:
        self.response.out.write(traceback.format_exc())
        return

      # extract the new globals that this statement added
      new_globals = {}
      for name, val in statement_module.__dict__.items():
        if name not in old_globals or val != old_globals[name]:
          new_globals[name] = val

      if True in [isinstance(val, UNPICKLABLE_TYPES)
                  for val in new_globals.values()]:
        # this statement added an unpicklable global. store the statement and
        # the names of all of the globals it added in the unpicklables.
        session.add_unpicklable(statement, new_globals.keys())
        logging.debug('Storing this statement as an unpicklable.')

      else:
        # this statement didn't add any unpicklables. pickle and store the
        # new globals back into the datastore.
        for name, val in new_globals.items():
          if not name.startswith('__'):
            session.set_global(name, val)

    finally:
      # always restore the real __main__ module
      sys.modules['__main__'] = old_main

    session.put()
+
+
def main():
  """Wire up the two shell handlers and serve the app via CGI.
  """
  routes = [('/admin/shell', FrontPageHandler),
            ('/admin/shell/shell.do', StatementHandler)]
  application = webapp.WSGIApplication(routes, debug=_DEBUG)
  wsgiref.handlers.CGIHandler().run(application)


if __name__ == '__main__':
  main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/app/shell/static/shell.js Tue May 26 02:37:39 2009 +0200
@@ -0,0 +1,195 @@
+// Copyright 2007 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+ * @fileoverview
+ * Javascript code for the interactive AJAX shell.
+ *
+ * Part of http://code.google.com/p/google-app-engine-samples/.
+ *
+ * Includes a function (shell.runStatement) that sends the current python
+ * statement in the shell prompt text box to the server, and a callback
+ * (shell.done) that displays the results when the XmlHttpRequest returns.
+ *
+ * Also includes cross-browser code (shell.getXmlHttpRequest) to get an
+ * XmlHttpRequest.
+ */
+
/**
 * Shell namespace.
 * @type {Object}
 */
var shell = {};  // FIX: added the missing statement-terminating semicolon
+
+/**
+ * The shell history. history is an array of strings, ordered oldest to
+ * newest. historyCursor is the current history element that the user is on.
+ *
+ * The last history element is the statement that the user is currently
+ * typing. When a statement is run, it's frozen in the history, a new history
+ * element is added to the end of the array for the new statement, and
+ * historyCursor is updated to point to the new element.
+ *
+ * @type {Array}
+ */
+shell.history = [''];
+
+/**
+ * See {shell.history}
+ * @type {number}
+ */
+shell.historyCursor = 0;
+
+/**
+ * A constant for the XmlHttpRequest 'done' state.
+ * @type Number
+ */
+shell.DONE_STATE = 4;
+
+/**
+ * A cross-browser function to get an XmlHttpRequest object.
+ *
+ * @return {XmlHttpRequest?} a new XmlHttpRequest
+ */
shell.getXmlHttpRequest = function() {
  // standards-compliant browsers
  if (window.XMLHttpRequest) {
    return new XMLHttpRequest();
  }
  // older IE: try the newer ActiveX ProgID first, then the legacy one
  if (window.ActiveXObject) {
    try {
      return new ActiveXObject('Msxml2.XMLHTTP');
    } catch(e) {
      return new ActiveXObject('Microsoft.XMLHTTP');
    }
  }
  // no XHR support at all
  return null;
};
+
+/**
+ * This is the prompt textarea's onkeypress handler. Depending on the key that
+ * was pressed, it will run the statement, navigate the history, or update the
+ * current statement in the history.
+ *
+ * @param {Event} event the keypress event
+ * @return {Boolean} false to tell the browser not to submit the form.
+ */
shell.onPromptKeyPress = function(event) {
  var statement = document.getElementById('statement');

  if (this.historyCursor == this.history.length - 1) {
    // we're on the current statement. update it in the history before doing
    // anything.
    this.history[this.historyCursor] = statement.value;
  }

  // should we pull something from the history?
  // (returning false cancels the browser's default handling of the key)
  if (event.shiftKey && event.keyCode == 38 /* up arrow */) {
    if (this.historyCursor > 0) {
      statement.value = this.history[--this.historyCursor];
    }
    return false;
  } else if (event.shiftKey && event.keyCode == 40 /* down arrow */) {
    if (this.historyCursor < this.history.length - 1) {
      statement.value = this.history[++this.historyCursor];
    }
    return false;
  } else if (!event.altKey) {
    // probably changing the statement. update it in the history.
    this.historyCursor = this.history.length - 1;
    this.history[this.historyCursor] = statement.value;
  }

  // should we submit?  the submit_key <select> decides whether plain Enter
  // or Ctrl-Enter runs the statement.
  var ctrlEnter = (document.getElementById('submit_key').value == 'ctrl-enter');
  if (event.keyCode == 13 /* enter */ && !event.altKey && !event.shiftKey &&
      event.ctrlKey == ctrlEnter) {
    return this.runStatement();
  }
  // otherwise fall through (undefined), letting the keystroke proceed.
};
+
+/**
+ * The XmlHttpRequest callback. If the request succeeds, it adds the command
+ * and its resulting output to the shell history div.
+ *
+ * @param {XmlHttpRequest} req the XmlHttpRequest we used to send the current
+ * statement to the server
+ */
shell.done = function(req) {
  if (req.readyState == this.DONE_STATE) {
    var statement = document.getElementById('statement');  // FIX: missing ';'
    statement.className = 'prompt';

    // add the command to the shell output
    var output = document.getElementById('output');

    output.value += '\n>>> ' + statement.value;
    statement.value = '';

    // add a new history element
    this.history.push('');
    this.historyCursor = this.history.length - 1;

    // add the command's result
    var result = req.responseText.replace(/^\s*|\s*$/g, ''); // trim whitespace
    if (result != '')
      output.value += '\n' + result;

    // scroll to the bottom
    output.scrollTop = output.scrollHeight;
    if (output.createTextRange) {
      var range = output.createTextRange();
      range.collapse(false);
      range.select();
    }
  }
};
+
+/**
+ * This is the form's onsubmit handler. It sends the python statement to the
+ * server, and registers shell.done() as the callback to run when it returns.
+ *
+ * @return {Boolean} false to tell the browser not to submit the form.
+ */
shell.runStatement = function() {
  var form = document.getElementById('form');

  // build a XmlHttpRequest
  var req = this.getXmlHttpRequest();
  if (!req) {
    document.getElementById('ajax-status').innerHTML =
        "<span class='error'>Your browser doesn't support AJAX. :(</span>";
    return false;
  }

  req.onreadystatechange = function() { shell.done(req); };

  // build the query parameter string
  var params = '';
  // FIX: declare the loop index with 'var'; it leaked into global scope
  for (var i = 0; i < form.elements.length; i++) {
    var elem = form.elements[i];
    if (elem.type != 'submit' && elem.type != 'button' && elem.id != 'caret') {
      // FIX: use encodeURIComponent instead of the deprecated escape();
      // escape() emits non-standard %uXXXX sequences for non-ASCII text
      // and skips '+'.  encodeURIComponent encodes both correctly as
      // UTF-8, so the manual '+' workaround is no longer needed.
      var value = encodeURIComponent(elem.value);
      params += '&' + elem.name + '=' + value;
    }
  }

  // send the request and tell the user.
  document.getElementById('statement').className = 'prompt processing';
  req.open(form.method, form.action + '?' + params, true);
  req.setRequestHeader('Content-type',
                       'application/x-www-form-urlencoded;charset=UTF-8');
  req.send(null);

  return false;
};
Binary file app/shell/static/spinner.gif has changed
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/app/shell/templates/shell.html Tue May 26 02:37:39 2009 +0200
@@ -0,0 +1,124 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<html>
+<head>
+<meta http-equiv="content-type" content="text/html; charset=utf-8" />
+<title> Interactive Shell </title>
+<script type="text/javascript" src="/static/shell.js"></script>
+<style type="text/css">
+body {
+ font-family: monospace;
+ font-size: 10pt;
+}
+
+p {
+ margin: 0.5em;
+}
+
+.prompt, #output {
+ width: 45em;
+ border: 1px solid silver;
+ background-color: #f5f5f5;
+ font-size: 10pt;
+ margin: 0.5em;
+ padding: 0.5em;
+ padding-right: 0em;
+ overflow-x: hidden;
+}
+
+#toolbar {
+ margin-left: 0.5em;
+ padding-left: 0.5em;
+}
+
+#caret {
+ width: 2.5em;
+ margin-right: 0px;
+ padding-right: 0px;
+ border-right: 0px;
+}
+
+#statement {
+ width: 43em;
+ margin-left: -1em;
+ padding-left: 0px;
+ border-left: 0px;
+ background-position: top right;
+ background-repeat: no-repeat;
+}
+
+.processing {
+ background-image: url("/static/spinner.gif");
+}
+
+#ajax-status {
+ font-weight: bold;
+}
+
+.message {
+ color: #8AD;
+ font-weight: bold;
+ font-style: italic;
+}
+
+.error {
+ color: #F44;
+}
+
+.username {
+ font-weight: bold;
+}
+
+</style>
+</head>
+
+<body>
+
+<p> Interactive server-side Python shell
+ (<a href="http://code.google.com/p/google-app-engine-samples/source/browse/#svn/trunk/shell">original source</a>)
+</p>
+<p>
+ <a href="/">Return to main home</a>
+</p>
+
+<textarea id="output" rows="30" readonly="readonly">
+{{ server_software }}
+Python {{ python_version }}
+</textarea>
+
+<form id="form" action="/admin/shell/shell.do" method="get">
+ <nobr>
+ <textarea class="prompt" id="caret" readonly="readonly" rows="4"
+ onfocus="document.getElementById('statement').focus()"
+ >>>></textarea>
+ <textarea class="prompt" name="statement" id="statement" rows="4"
+ onkeypress="return shell.onPromptKeyPress(event);"></textarea>
+ </nobr>
+ <input type="hidden" name="session" value="{{ session }}" />
+ <input type="submit" style="display: none" />
+</form>
+
+<p id="ajax-status"></p>
+
+<p id="toolbar">
+{% if user %}
+ <span class="username">{{ user.nickname }}</span>
+ (<a href="{{ logout_url }}">log out</a>)
+{% else %}
+ <a href="{{ login_url }}">log in</a>
+{% endif %}
+ | Shift-Up/Down for history |
+<select id="submit_key">
+ <option value="enter">Enter</option>
+ <option value="ctrl-enter" selected="selected">Ctrl-Enter</option>
+</select>
+<label for="submit_key">submits</label>
+</p>
+
+<script type="text/javascript">
+document.getElementById('statement').focus();
+</script>
+
+</body>
+</html>
+
--- a/app/soc/cron/job.py Mon May 25 23:42:15 2009 +0200
+++ b/app/soc/cron/job.py Tue May 26 02:37:39 2009 +0200
@@ -29,8 +29,10 @@
from google.appengine.runtime import DeadlineExceededError
from soc.cron import student_proposal_mailer
+from soc.cron import unique_user_id_adder
from soc.models.job import Job
+
class Error(Exception):
"""Base class for all exceptions raised by this module.
"""
@@ -67,6 +69,10 @@
student_proposal_mailer.setupStudentProposalMailing
self.tasks['sendStudentProposalMail'] = \
student_proposal_mailer.sendStudentProposalMail
+ self.tasks['setupUniqueUserIdAdder'] = \
+ unique_user_id_adder.setupUniqueUserIdAdder
+ self.tasks['addUniqueUserIds'] = \
+ unique_user_id_adder.addUniqueUserIds
def claimJob(self, job_key):
"""A transaction to claim a job.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/app/soc/cron/unique_user_id_adder.py Tue May 26 02:37:39 2009 +0200
@@ -0,0 +1,135 @@
+#!/usr/bin/python2.5
+#
+# Copyright 2009 the Melange authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Cron job handler for adding unique user id.
+"""
+
+__authors__ = [
+ '"Pawel Solyga" <pawel.solyga@gmail.com>',
+ ]
+
+
+from google.appengine.ext import db
+from google.appengine.api import users
+from soc.logic.models.job import logic as job_logic
+from soc.logic.models.priority_group import logic as priority_logic
+from soc.logic.models.user import logic as user_logic
+from soc.models.user import User
+
+
+# amount of users to create jobs for before updating
+DEF_USER_STEP_SIZE = 10
+
+
+class TempUserWithUniqueId(db.Model):
+ """Helper model for temporary storing User Property with unique id.
+ """
+ user = db.UserProperty(required=True)
+
+
+def emailToAccountAndUserId(address):
+  """Returns (account, unique user id) for the given email address, resolved
+  by round-tripping a temporary entity through the datastore.
+  """
+  user = users.User(address)
+  key = TempUserWithUniqueId(user=user).put()
+  obj = TempUserWithUniqueId.get(key)
+  return (obj.user, obj.user.user_id())
+
+
+def setupUniqueUserIdAdder(job_entity):
+  """Job that sets up jobs that will add unique user ids to all Users.
+
+  Args:
+    job_entity: a Job entity with key_data set to
+                [last_completed_user]
+  """
+
+  from soc.cron.job import FatalJobError
+  key_data = job_entity.key_data
+  user_fields = {'user_id': None}
+
+  if len(key_data) == 1:
+    # start where we left off
+    user_fields['__key__ >'] = key_data[0]
+
+  m_users = user_logic.getForFields(user_fields,
+                                    limit=DEF_USER_STEP_SIZE)
+
+  # set the default fields for the jobs we are going to create
+  priority_group = priority_logic.getGroup(priority_logic.CONVERT)
+  job_fields = {
+      'priority_group': priority_group,
+      'task_name': 'addUniqueUserIds'}
+
+  job_query_fields = job_fields.copy()
+
+  while m_users:
+    # for each user create an adder job
+    for user in m_users:
+
+      job_query_fields['key_data'] = user.key()
+      adder_job = job_logic.getForFields(job_query_fields, unique=True)
+
+      if not adder_job:
+        # no adder job for this user yet, create one
+        job_fields['key_data'] = [user.key()]
+        job_logic.updateOrCreateFromFields(job_fields)
+
+    # update our own job
+    last_user_key = m_users[-1].key()
+
+    if len(key_data) == 1:
+      key_data[0] = last_user_key
+    else:
+      key_data.append(last_user_key)
+
+    updated_job_fields = {'key_data': key_data}
+    job_logic.updateEntityProperties(job_entity, updated_job_fields)
+
+    # rinse and repeat
+    user_fields['__key__ >'] = last_user_key
+    m_users = user_logic.getForFields(user_fields,
+                                      limit=DEF_USER_STEP_SIZE)
+
+  # we are finished
+  return
+
+
+def addUniqueUserIds(job_entity):
+ """Job that will add unique user id to a User.
+
+ Args:
+ job_entity: a Job entity with key_data set to [user_key]
+ """
+
+ from soc.cron.job import FatalJobError
+
+ user_keyname = job_entity.key_data[0].name()
+ user_entity = user_logic.getFromKeyName(user_keyname)
+
+ if not user_entity:
+ raise FatalJobError('The User with keyname %s does not exist!' % (
+ user_keyname))
+
+ # add unique user id
+ account, user_id = emailToAccountAndUserId(user_entity.account.email())
+ user_entity.account = account
+ user_entity.user_id = user_id
+ user_entity.put()
+
+ # we are done here
+ return
\ No newline at end of file
--- a/app/soc/logic/accounts.py Mon May 25 23:42:15 2009 +0200
+++ b/app/soc/logic/accounts.py Tue May 26 02:37:39 2009 +0200
@@ -35,6 +35,13 @@
return normalizeAccount(account) if (account and normalize) else account
+def getCurrentUserId():
+  """Returns a unique id of the current user, or None if unavailable."""
+
+  user = users.get_current_user()
+  return user.user_id() if user else None
+
+
def normalizeAccount(account):
"""Returns a normalized version of the specified account.
"""
@@ -46,6 +53,7 @@
return users.User(email=normalized)
+
def denormalizeAccount(account):
"""Returns a denormalized version of the specified account.
"""
@@ -58,6 +66,7 @@
return users.User(email=denormalized)
+
def isDeveloper(account=None):
"""Returns True if a Google Account is a Developer with special privileges.
--- a/app/soc/logic/cleaning.py Mon May 25 23:42:15 2009 +0200
+++ b/app/soc/logic/cleaning.py Tue May 26 02:37:39 2009 +0200
@@ -21,10 +21,11 @@
'"Todd Larsen" <tlarsen@google.com>',
'"Sverre Rabbelier" <sverre@rabbelier.nl>',
'"Lennard de Rijk" <ljvderijk@gmail.com>',
+ '"Pawel Solyga" <pawel.solyga@gmail.com>',
]
-import feedparser
+from htmlsanitizer import HtmlSanitizer
from google.appengine.api import users
@@ -379,16 +380,25 @@
def wrapped(self):
"""Decorator wrapper method.
"""
+ from HTMLParser import HTMLParseError
content = self.cleaned_data.get(field_name)
+ # clean_html_content is called when writing data into GAE rather than
+ # when reading data from GAE. This short-circuiting of the sanitizer
+ # only affects html authored by developers. The isDeveloper test for
+ # example allows developers to add javascript.
if user_logic.isDeveloper():
return content
-
- sanitizer = feedparser._HTMLSanitizer('utf-8')
- sanitizer.feed(content)
- content = sanitizer.output()
- content = content.decode('utf-8')
+
+ try:
+ cleaner = HtmlSanitizer.Cleaner()
+ cleaner.string = content
+ cleaner.clean()
+ except HTMLParseError, msg:
+ raise forms.ValidationError(msg)
+
+ content = cleaner.string
content = content.strip().replace('\r\n', '\n')
return content
--- a/app/soc/logic/helper/notifications.py Mon May 25 23:42:15 2009 +0200
+++ b/app/soc/logic/helper/notifications.py Tue May 26 02:37:39 2009 +0200
@@ -191,6 +191,7 @@
'scope_path': to_user.link_id
}
+ import soc.logic.models.notification
key_name = model_logic.notification.logic.getKeyNameFromFields(fields)
# create and put a new notification in the datastore
@@ -204,6 +205,8 @@
notification_entity: Notification about which the message should be sent
"""
+ import soc.views.models.notification
+
# create the url to show this notification
notification_url = "http://%(host)s%(index)s" % {
'host' : os.environ['HTTP_HOST'],
--- a/app/soc/logic/models/base.py Mon May 25 23:42:15 2009 +0200
+++ b/app/soc/logic/models/base.py Tue May 26 02:37:39 2009 +0200
@@ -25,6 +25,8 @@
]
+import logging
+
from google.appengine.ext import db
from django.utils.translation import ugettext
@@ -324,7 +326,13 @@
query = self.getQueryForFields(filter=filter, order=order)
- result = query.fetch(limit, offset)
+ try:
+ result = query.fetch(limit, offset)
+ except db.NeedIndexError, exception:
+ result = []
+ logging.exception("%s, model: %s filter: %s, order: %s" %
+ (exception, self._model, filter, order))
+ # TODO: send email
if unique:
return result[0] if result else None
--- a/app/soc/logic/models/user.py Mon May 25 23:42:15 2009 +0200
+++ b/app/soc/logic/models/user.py Tue May 26 02:37:39 2009 +0200
@@ -75,6 +75,20 @@
return self.getForAccount(account)
+ def getForCurrentUserId(self):
+ """Retrieves the user entity for the currently logged in user id.
+
+ If there is no user logged in, or they have no valid associated User
+ entity, None is returned.
+ """
+
+ user_id = accounts.getCurrentUserId()
+
+ if not user_id:
+ return None
+
+ return self.getForUserId(user_id)
+
def getForAccount(self, account):
"""Retrieves the user entity for the specified account.
@@ -94,6 +108,23 @@
return self.getForFields(filter=fields, unique=True)
+  def getForUserId(self, user_id):
+    """Retrieves the user entity for the specified user id.
+
+    If there is no valid User entity associated with the specified user
+    id, None is returned. Raises InvalidArgumentError for an empty id.
+    """
+
+    if not user_id:
+      raise base.InvalidArgumentError
+
+    fields = {
+        'user_id': user_id,
+        'status':'valid',
+    }
+
+    return self.getForFields(filter=fields, unique=True)
+
def isDeveloper(self, account=None, user=None):
"""Returns true iff the specified user is a Developer.
--- a/app/soc/models/seed_db.py Mon May 25 23:42:15 2009 +0200
+++ b/app/soc/models/seed_db.py Tue May 26 02:37:39 2009 +0200
@@ -139,7 +139,7 @@
def seed(self, i, entities=None, current_user=None, gsoc2009=None):
properties = {
- 'key_name': 'google/gsoc2009/%04d' % i,
+ 'key_name': 'google/gsoc2009/org_%04d' % i,
'link_id': 'org_%04d' % i,
'name': 'Organization %04d' % i,
'short_name': 'Org %04d' % i,
@@ -176,6 +176,58 @@
gsoc2009=gsoc2009)
+class OrgApplicationSeeder(Seeder):
+ def type(self):
+ return OrgApplication
+
+ def commonSeedArgs(self, request):
+ _, current_user = ensureUser()
+ gsoc2009 = Program.get_by_key_name('google/gsoc2009')
+
+ if not gsoc2009:
+ raise Error('Run seed_db first')
+
+ status = request.GET.get('status', 'pre-accepted')
+
+ return dict(current_user=current_user,
+ gsoc2009=gsoc2009,
+ status=status)
+
+
+ def seed(self, i, entities=None, current_user=None, gsoc2009=None,
+ status=None):
+ properties = {
+ 'key_name': 'google/gsoc2009/org_%04d' % i,
+ 'link_id': 'org_%04d' % i,
+ 'name': 'Org App %04d' % i,
+ 'scope_path': 'google/gsoc2009',
+ 'scope': gsoc2009,
+ 'status': status,
+ 'applicant': current_user,
+ 'home_page': 'http://www.google.com',
+ 'email': 'org@example.com',
+ 'irc_channel': '#care',
+ 'pub_mailing_list': 'http://groups.google.com',
+ 'dev_mailing_list': 'http://groups.google.com',
+ 'description': 'This is an awesome org!',
+ 'why_applying': 'Because we can',
+ 'member_criteria': 'They need to be awesome',
+ 'license_name': 'Apache License, 2.0',
+ 'ideas': 'http://code.google.com/p/soc/issues',
+ 'contrib_disappears': 'We use google to find them',
+ 'member_disappears': 'See above',
+ 'encourage_contribs': 'We offer them cookies.',
+ 'continued_contribs': 'We promise them a cake.',
+ 'agreed_to_admin_agreement': True,
+ }
+
+ org_application = OrgApplication(**properties)
+ if entities is None:
+ org_application.put()
+ else:
+ entities.append(org_application)
+
+
def seed(request, *args, **kwargs):
"""Seeds the datastore with some default values.
"""
@@ -322,9 +374,9 @@
}
for i in range(10):
- org_app_properties['key_name'] = 'google/gsoc2009/wannabe_%d' % i
- org_app_properties['link_id'] = 'wannabe_%d' % i
- org_app_properties['name'] = 'Wannabe %d' % i
+ org_app_properties['key_name'] = 'google/gsoc2009/org_%04d' % i
+ org_app_properties['link_id'] = 'org_%04d' % i
+ org_app_properties['name'] = 'Org App %04d' % i
entity = OrgApplication(**org_app_properties)
entity.put()
@@ -479,8 +531,8 @@
raise Error('Run seed_db first')
properties = {
- 'key_name': 'google/gsoc2009/org_app_%d' % i,
- 'link_id': 'org_app_%d' % i,
+ 'key_name': 'google/gsoc2009/org_%d' % i,
+ 'link_id': 'org_%d' % i,
'name': 'Org App %d' % i,
'scope_path': 'google/gsoc2009',
'scope': gsoc2009,
@@ -577,16 +629,16 @@
def seed_student(request, i):
"""Returns the properties for a new student entity.
"""
-
+
gsoc2009 = Program.get_by_key_name('google/gsoc2009')
user = User.get_by_key_name('user_%d' % i)
-
+
if not gsoc2009:
raise Error('Run seed_db first')
-
+
if not user:
raise Error('Run seed_many for at least %d users first.' % i)
-
+
properties = {
'key_name':'google/gsoc2009/student_%d' % i,
'link_id': 'student_%d' % i,
@@ -627,13 +679,13 @@
mentor = Mentor.get_by_key_name('google/gsoc2009/org_%d/mentor' % i)
user = User.get_by_key_name('user_%d' % i)
student = Student.get_by_key_name('google/gsoc2009/student_%d' % i)
-
+
if not user:
raise Error('Run seed_many for at least %d users first.' % i)
if not student:
raise Error('Run seed_many for at least %d students first.' % i)
-
+
if not org:
raise Error('Run seed_many for at least %d orgs first.' % i)
@@ -669,6 +721,7 @@
SEEDABLE_MODEL_TYPES = {
'user' : UserSeeder(),
'organization' : OrganizationSeeder(),
+ 'org_application' : OrgApplicationSeeder(),
}
@@ -716,7 +769,7 @@
# so, we look for what's after the _ and turn it into an int.
link_id = highest_instance.link_id
if '_' in link_id:
- start_index = int(link_id.split('_')[1]) + 1
+ start_index = int(link_id.split('_')[-1]) + 1
else:
# couldn't find seeded_entities; guessing there are none
start_index = 0
--- a/app/soc/models/user.py Mon May 25 23:42:15 2009 +0200
+++ b/app/soc/models/user.py Tue May 26 02:37:39 2009 +0200
@@ -71,6 +71,9 @@
verbose_name=ugettext('User account'))
account.help_text = ugettext(
'A valid Google Account.')
+
+ #: Google Account unique user id
+ user_id = db.StringProperty(required=False)
#: A list (possibly empty) of former Google Accounts associated with
#: this User.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/app/soc/modules/__init__.py Tue May 26 02:37:39 2009 +0200
@@ -0,0 +1,17 @@
+#
+# Copyright 2009 the Melange authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""This is the main modules module.
+"""
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/app/soc/modules/callback.py Tue May 26 02:37:39 2009 +0200
@@ -0,0 +1,40 @@
+# Copyright 2009 the Melange authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Module containing Melange callbacks.
+"""
+
+__authors__ = [
+ '"Sverre Rabbelier" <sverre@rabbelier.nl>',
+ '"Lennard de Rijk" <ljvderijk@gmail.com>',
+ ]
+
+
+CORE = None
+
+
+def registerCore(core):
+ """Registers the specified callback as core.
+ """
+
+ global CORE
+ CORE = core
+
+
+def getCore():
+ """Returns the Core handler.
+ """
+
+ global CORE
+ return CORE
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/app/soc/modules/core.py Tue May 26 02:37:39 2009 +0200
@@ -0,0 +1,235 @@
+# Copyright 2009 the Melange authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""The Melange Core module.
+"""
+
+__authors__ = [
+ '"Sverre Rabbelier" <sverre@rabbelier.nl>',
+ '"Lennard de Rijk" <ljvderijk@gmail.com>',
+ ]
+
+
+from django.conf.urls import defaults
+
+import settings
+import soc.cache.sidebar
+
+
+class Error(Exception):
+ """Error class for the callback module.
+ """
+
+ pass
+
+
+class APIVersionMismatch(Error):
+ """Error raised when API version mismatches.
+ """
+
+ MISMATCH_MSG_FMT = "API mismatch, expected '%d', got '%d'."
+
+ def __init__(self, expected, actual):
+ """Instantiates a new exception with a customized message.
+ """
+
+ msg = self.MISMATCH_MSG_FMT % (expected, actual)
+ super(APIVersionMismatch, self).__init__(msg)
+
+
+class MissingService(Error):
+ """Error raised when a required service is missing.
+ """
+
+ MISSING_SERVICE_FMT = "Required service '%s' is not registered, known: %s"
+
+ def __init__(self, service, services):
+ """Instantiates a new exception with a customized message.
+ """
+
+ msg = self.MISSING_SERVICE_FMT % (service, services)
+ super(MissingService, self).__init__(msg)
+
+
+class NonUniqueService(Error):
+  """Error raised when a unique service is called a second time.
+  """
+
+  NON_UNIQUE_SERVICE_FMT = "Unique service '%s' called a second time, known: %s."
+
+  def __init__(self, service, services):
+    """Instantiates a new exception with a customized message.
+    """
+
+    msg = self.NON_UNIQUE_SERVICE_FMT % (service, services)
+    super(NonUniqueService, self).__init__(msg)
+
+
+class Core(object):
+ """The core handler that controls the Melange API.
+ """
+
+  def __init__(self):
+    """Creates a new instance of the Core.
+    """
+
+    self.API_VERSION = 1
+
+    self.registered_callbacks = []
+    self.capabilities = []
+    self.services = []
+
+    self.sitemap = []
+    self.sidebar = []
+
+ ##
+ ## internal
+ ##
+
+ def getService(self, callback, service):
+ """Retrieves the specified service from the callback if supported.
+
+ Args:
+ callback: the callback to retrieve the capability from
+ service: the service to retrieve
+ """
+
+ if not hasattr(callback, service):
+ return False
+
+ func = getattr(callback, service)
+
+ if not callable(func):
+ return False
+
+ return func
+
+ ##
+ ## Core code
+ ##
+
+ def getPatterns(self):
+ """Returns the Django patterns for this site.
+ """
+
+ self.callService('registerWithSitemap', True)
+ return defaults.patterns(None, *self.sitemap)
+
+ @soc.cache.sidebar.cache
+ def getSidebar(self, id, user):
+ """Constructs a sidebar for the current user.
+ """
+
+ self.callService('registerWithSidebar', True)
+
+ sidebar = []
+
+ for i in self.sidebar:
+ menus = i(id, user)
+
+ for menu in (menus if menus else []):
+ sidebar.append(menu)
+
+ return sorted(sidebar, key=lambda x: x.get('group'))
+
+ def callService(self, service, unique, *args, **kwargs):
+ """Calls the specified service on all callbacks.
+ """
+
+ if unique and (service in self.services):
+ return
+
+ results = []
+
+ for callback in self.registered_callbacks:
+ func = self.getService(callback, service)
+ if not func:
+ continue
+
+ result = func(*args, **kwargs)
+ results.append(result)
+
+ self.services.append(service)
+ return results
+
+  def registerModuleCallbacks(self):
+    """Retrieves all callbacks for the modules of this site.
+
+    Callbacks for modules without a version number or the wrong API_VERSION
+    number are dropped. They won't be called.
+    """
+
+    fmt = settings.MODULE_FMT
+    modules = ['soc_core'] + settings.MODULES
+    modules = [__import__(fmt % i, fromlist=['']) for i in modules]
+
+    for callback_class in [i.getCallback() for i in modules]:
+      if callback_class.API_VERSION != self.API_VERSION:
+        raise APIVersionMismatch(self.API_VERSION,
+                                 callback_class.API_VERSION)
+
+      # version check passed, instantiate and register the callback
+      callback = callback_class(self)
+      self.registered_callbacks.append(callback)
+
+    return True
+
+ ##
+ ## Module code
+ ##
+
+ def registerCapability(self, capability):
+ """Registers the specified capability.
+ """
+
+ self.capabilities.append(capability)
+
+  def requireCapability(self, capability):
+    """Requires that the specified capability is present.
+    """
+
+    if capability in self.capabilities:
+      return True
+    raise Error("Required capability '%s' is not registered, known: %s" %
+                (capability, self.capabilities))
+
+ def requireService(self, service):
+ """Requires that the specified service has been called.
+ """
+
+ if service in self.services:
+ return True
+
+ raise MissingService(service, self.services)
+
+ def requireUniqueService(self, service):
+ """Requires that the specified service is called exactly once.
+ """
+
+ if service not in self.services:
+ return True
+
+ raise NonUniqueService(service, self.services)
+
+ def registerSitemapEntry(self, entries):
+ """Registers the specified entries with the sitemap.
+ """
+
+ self.sitemap.extend(entries)
+
+ def registerSidebarEntry(self, entry):
+ """Registers the specified entry with the sidebar.
+ """
+
+ self.sidebar.append(entry)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/app/soc/modules/soc_core/__init__.py Tue May 26 02:37:39 2009 +0200
@@ -0,0 +1,29 @@
+#
+# Copyright 2009 the Melange authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""This is the main modules module.
+"""
+
+__authors__ = [
+ '"Sverre Rabbelier" <sverre@rabbelier.nl>',
+ '"Lennard de Rijk" <ljvderijk@gmail.com>',
+ ]
+
+
+
+from soc.modules.soc_core import callback
+
+def getCallback():
+ return callback.Callback
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/app/soc/modules/soc_core/callback.py Tue May 26 02:37:39 2009 +0200
@@ -0,0 +1,129 @@
+# Copyright 2009 the Melange authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Module containing the core callback.
+"""
+
+__authors__ = [
+ '"Sverre Rabbelier" <sverre@rabbelier.nl>',
+ '"Lennard de Rijk" <ljvderijk@gmail.com>',
+ ]
+
+
+from soc.modules import callback
+
+from soc.views.models import club
+from soc.views.models import club_app
+from soc.views.models import club_admin
+from soc.views.models import club_member
+from soc.views.models import cron
+from soc.views.models import document
+from soc.views.models import host
+from soc.views.models import job
+from soc.views.models import mentor
+from soc.views.models import notification
+from soc.views.models import organization
+from soc.views.models import org_admin
+from soc.views.models import org_app
+from soc.views.models import priority_group
+from soc.views.models import program
+from soc.views.models import request
+from soc.views.models import site
+from soc.views.models import sponsor
+from soc.views.models import student
+from soc.views.models import student_project
+from soc.views.models import student_proposal
+from soc.views.models import timeline
+from soc.views.models import user
+from soc.views.models import user_self
+
+
+class Callback(object):
+ """Callback object that handles interaction between the core.
+ """
+
+ API_VERSION = 1
+
+ def __init__(self, core):
+ """Initializes a new Callback object for the specified core.
+ """
+
+ self.core = core
+
+ # disable clubs
+ self.enable_clubs = False
+
+ def registerWithSitemap(self):
+ """Called by the server when sitemap entries should be registered.
+ """
+
+ self.core.requireUniqueService('registerWithSitemap')
+
+ if self.enable_clubs:
+ self.core.registerSitemapEntry(club.view.getDjangoURLPatterns())
+ self.core.registerSitemapEntry(club_admin.view.getDjangoURLPatterns())
+ self.core.registerSitemapEntry(club_app.view.getDjangoURLPatterns())
+ self.core.registerSitemapEntry(club_member.view.getDjangoURLPatterns())
+
+ self.core.registerSitemapEntry(cron.view.getDjangoURLPatterns())
+ self.core.registerSitemapEntry(document.view.getDjangoURLPatterns())
+ self.core.registerSitemapEntry(host.view.getDjangoURLPatterns())
+ self.core.registerSitemapEntry(job.view.getDjangoURLPatterns())
+ self.core.registerSitemapEntry(mentor.view.getDjangoURLPatterns())
+ self.core.registerSitemapEntry(notification.view.getDjangoURLPatterns())
+ self.core.registerSitemapEntry(organization.view.getDjangoURLPatterns())
+ self.core.registerSitemapEntry(org_admin.view.getDjangoURLPatterns())
+ self.core.registerSitemapEntry(org_app.view.getDjangoURLPatterns())
+ self.core.registerSitemapEntry(priority_group.view.getDjangoURLPatterns())
+ self.core.registerSitemapEntry(program.view.getDjangoURLPatterns())
+ self.core.registerSitemapEntry(request.view.getDjangoURLPatterns())
+ self.core.registerSitemapEntry(site.view.getDjangoURLPatterns())
+ self.core.registerSitemapEntry(sponsor.view.getDjangoURLPatterns())
+ self.core.registerSitemapEntry(student.view.getDjangoURLPatterns())
+ self.core.registerSitemapEntry(student_project.view.getDjangoURLPatterns())
+ self.core.registerSitemapEntry(student_proposal.view.getDjangoURLPatterns())
+ self.core.registerSitemapEntry(timeline.view.getDjangoURLPatterns())
+ self.core.registerSitemapEntry(user_self.view.getDjangoURLPatterns())
+ self.core.registerSitemapEntry(user.view.getDjangoURLPatterns())
+
+ def registerWithSidebar(self):
+ """Called by the server when sidebar entries should be registered.
+ """
+
+ self.core.requireUniqueService('registerWithSidebar')
+
+ if self.enable_clubs:
+ self.core.registerSidebarEntry(club.view.getSidebarMenus)
+ self.core.registerSidebarEntry(club.view.getExtraMenus)
+ self.core.registerSidebarEntry(club_admin.view.getSidebarMenus)
+ self.core.registerSidebarEntry(club_member.view.getSidebarMenus)
+ self.core.registerSidebarEntry(club_app.view.getSidebarMenus)
+
+ self.core.registerSidebarEntry(user_self.view.getSidebarMenus)
+ self.core.registerSidebarEntry(site.view.getSidebarMenus)
+ self.core.registerSidebarEntry(user.view.getSidebarMenus)
+ self.core.registerSidebarEntry(sponsor.view.getSidebarMenus)
+ self.core.registerSidebarEntry(sponsor.view.getExtraMenus)
+ self.core.registerSidebarEntry(host.view.getSidebarMenus)
+ self.core.registerSidebarEntry(request.view.getSidebarMenus)
+ self.core.registerSidebarEntry(program.view.getSidebarMenus)
+ self.core.registerSidebarEntry(program.view.getExtraMenus)
+ self.core.registerSidebarEntry(student.view.getSidebarMenus)
+ self.core.registerSidebarEntry(student_project.view.getSidebarMenus)
+ self.core.registerSidebarEntry(student_proposal.view.getSidebarMenus)
+ self.core.registerSidebarEntry(organization.view.getSidebarMenus)
+ self.core.registerSidebarEntry(organization.view.getExtraMenus)
+ self.core.registerSidebarEntry(org_admin.view.getSidebarMenus)
+ self.core.registerSidebarEntry(mentor.view.getSidebarMenus)
+ self.core.registerSidebarEntry(org_app.view.getSidebarMenus)
--- a/app/soc/templates/soc/club_admin/manage.html Mon May 25 23:42:15 2009 +0200
+++ b/app/soc/templates/soc/club_admin/manage.html Tue May 26 02:37:39 2009 +0200
@@ -23,7 +23,17 @@
<td>
Please select the appropriate action:</br>
<input type="button" onclick="location.href='/{{ url_name }}/manage/{{ entity.scope_path }}/{{ entity.link_id }}?resign=true'" value="Resign"/>
- <input type="button" onclick="location.href='{{ cancel_redirect }}'" value="Cancel"/>
+ {% if cancel_redirect %}
+ <input type="button"
+ {% if entity %}
+ onclick="location.href='{{ cancel_redirect }}'"
+ {% else %}
+ onClick="javascript: history.go(-1)">
+ {% endif %}
+ value="Cancel"/>
+ {% else %}
+ <input type="button" value="Back to Previous Page" onClick="javascript:history.go(-1)">
+ {% endif %}
</td>
</tr>
{% endblock %}
--- a/app/soc/templates/soc/club_member/manage.html Mon May 25 23:42:15 2009 +0200
+++ b/app/soc/templates/soc/club_member/manage.html Tue May 26 02:37:39 2009 +0200
@@ -23,7 +23,17 @@
<td>
Please select the appropriate action:</br>
<input type="button" onclick="location.href='/{{ url_name }}/manage/{{ entity.scope_path }}/{{ entity.link_id }}?resign=true'" value="Resign"/>
- <input type="button" onclick="location.href='{{ cancel_redirect }}'" value="Cancel"/>
+ {% if cancel_redirect %}
+ <input type="button"
+ {% if entity %}
+ onclick="location.href='{{ cancel_redirect }}'"
+ {% else %}
+ onClick="javascript: history.go(-1)">
+ {% endif %}
+ value="Cancel"/>
+ {% else %}
+ <input type="button" value="Back to Previous Page" onClick="javascript:history.go(-1)">
+ {% endif %}
</td>
</tr>
{% endblock %}
--- a/app/soc/templates/soc/host/manage.html Mon May 25 23:42:15 2009 +0200
+++ b/app/soc/templates/soc/host/manage.html Tue May 26 02:37:39 2009 +0200
@@ -23,7 +23,17 @@
<td>
Please select the appropriate action:</br>
<input type="button" onclick="location.href='/{{ url_name }}/manage/{{ entity.scope_path }}/{{ entity.link_id }}?resign=true'" value="Resign"/>
- <input type="button" onclick="location.href='{{ cancel_redirect }}'" value="Cancel"/>
+ {% if cancel_redirect %}
+ <input type="button"
+ {% if entity %}
+ onclick="location.href='{{ cancel_redirect }}'"
+ {% else %}
+ onClick="javascript: history.go(-1)">
+ {% endif %}
+ value="Cancel"/>
+ {% else %}
+ <input type="button" value="Back to Previous Page" onClick="javascript:history.go(-1)">
+ {% endif %}
</td>
</tr>
{% endblock %}
--- a/app/soc/templates/soc/mentor/manage.html Mon May 25 23:42:15 2009 +0200
+++ b/app/soc/templates/soc/mentor/manage.html Tue May 26 02:37:39 2009 +0200
@@ -23,7 +23,17 @@
<td>
Please select the appropriate action:</br>
<input type="button" onclick="location.href='/{{ url_name }}/manage/{{ entity.scope_path }}/{{ entity.link_id }}?resign=true'" value="Resign"/>
- <input type="button" onclick="location.href='{{ cancel_redirect }}'" value="Cancel"/>
+ {% if cancel_redirect %}
+ <input type="button"
+ {% if entity %}
+ onclick="location.href='{{ cancel_redirect }}'"
+ {% else %}
+ onClick="javascript: history.go(-1)">
+ {% endif %}
+ value="Cancel"/>
+ {% else %}
+ <input type="button" value="Back to Previous Page" onClick="javascript:history.go(-1)">
+ {% endif %}
</td>
</tr>
{% endblock %}
--- a/app/soc/templates/soc/models/edit.html Mon May 25 23:42:15 2009 +0200
+++ b/app/soc/templates/soc/models/edit.html Tue May 26 02:37:39 2009 +0200
@@ -58,11 +58,17 @@
{% endif %}
{% endblock %}
<td>
- {% if edit_cancel_redirect %}
- <input type="button" onclick="location.href='{{ edit_cancel_redirect }}'" value="Cancel"/>
- {% else %}
- <input type="button" value="Back to Previous Page" onClick="javascript: history.go(-1)">
- {% endif %}
+ {% if cancel_redirect %}
+ <input type="button"
+ {% if entity %}
+ onclick="location.href='{{ cancel_redirect }}'"
+ {% else %}
+ onClick="javascript: history.go(-1)">
+ {% endif %}
+ value="Cancel"/>
+ {% else %}
+ <input type="button" value="Back to Previous Page" onClick="javascript:history.go(-1)">
+ {% endif %}
</td>
{% if entity %}
{% block delete_button %}
--- a/app/soc/templates/soc/notification/list/row.html Mon May 25 23:42:15 2009 +0200
+++ b/app/soc/templates/soc/notification/list/row.html Tue May 26 02:37:39 2009 +0200
@@ -11,5 +11,5 @@
</div>
</td>
<td><div class="subject">{{ list.item.subject }}</div></td>
- <td><div class="created_on">{{ list.item.created_on }} </div> </td>
+ <td><div class="created_on">{{ list.item.created_on|date:"jS F Y H:i" }} </div> </td>
</tr>
--- a/app/soc/templates/soc/org_admin/manage.html Mon May 25 23:42:15 2009 +0200
+++ b/app/soc/templates/soc/org_admin/manage.html Tue May 26 02:37:39 2009 +0200
@@ -23,7 +23,17 @@
<td>
Please select the appropriate action:</br>
<input type="button" onclick="location.href='/{{ url_name }}/manage/{{ entity.scope_path }}/{{ entity.link_id }}?resign=true'" value="Resign"/>
- <input type="button" onclick="location.href='{{ cancel_redirect }}'" value="Cancel"/>
+ {% if cancel_redirect %}
+ <input type="button"
+ {% if entity %}
+ onclick="location.href='{{ cancel_redirect }}'"
+ {% else %}
+ onClick="javascript: history.go(-1)"
+ {% endif %}
+ value="Cancel"/>
+ {% else %}
+ <input type="button" value="Back to Previous Page" onClick="javascript:history.go(-1)">
+ {% endif %}
</td>
</tr>
{% endblock %}
--- a/app/soc/templates/soc/student/manage.html Mon May 25 23:42:15 2009 +0200
+++ b/app/soc/templates/soc/student/manage.html Tue May 26 02:37:39 2009 +0200
@@ -23,7 +23,17 @@
<td>
Please select the appropriate action:</br>
<input type="button" onclick="location.href='/{{ url_name }}/manage/{{ entity.scope_path }}/{{ entity.link_id }}?resign=true'" value="Resign"/>
- <input type="button" onclick="location.href='{{ cancel_redirect }}'" value="Cancel"/>
+ {% if cancel_redirect %}
+ <input type="button"
+ {% if entity %}
+ onclick="location.href='{{ cancel_redirect }}'"
+ {% else %}
+ onClick="javascript: history.go(-1)"
+ {% endif %}
+ value="Cancel"/>
+ {% else %}
+ <input type="button" value="Back to Previous Page" onClick="javascript:history.go(-1)">
+ {% endif %}
</td>
</tr>
{% endblock %}
--- a/app/soc/views/helper/lists.py Mon May 25 23:42:15 2009 +0200
+++ b/app/soc/views/helper/lists.py Tue May 26 02:37:39 2009 +0200
@@ -22,6 +22,7 @@
'"Pawel Solyga" <pawel.solyga@gmail.com>',
]
+import logging
from soc.logic import dicts
from soc.logic.models.user import logic as user_logic
@@ -60,8 +61,6 @@
OFFSET_KEY = 'offset_%d'
LIMIT_KEY = 'limit_%d'
-OFFSET_KEYNAME_KEY = 'offset_keyname_%d'
-REVERSE_DIRECTION_KEY = 'reverse_sort_direction_%d'
def makeOffsetKey(limit_idx):
@@ -72,14 +71,6 @@
return LIMIT_KEY % limit_idx
-def makeOffsetKeynameKey(limit_idx):
- return OFFSET_KEYNAME_KEY % limit_idx
-
-
-def makeReverseDirectionKey(limit_idx):
- return REVERSE_DIRECTION_KEY % limit_idx
-
-
def getListParameters(request, list_index):
"""Retrieves, converts and validates values for one list
@@ -119,44 +110,30 @@
else:
limit = min(DEF_MAX_PAGINATION, limit)
- result = dict(limit=limit, offset=offset)
- offset_keyname_key = makeOffsetLinkidKey(list_index)
- offset_keyname = request.GET.get(offset_keyname_key, '')
- # TODO(dbentley): URL unescape
- result['offset_keyname'] = offset_keyname
-
- reverse_direction = makeReverseDirectionKey(list_index) in request.GET
- result['reverse_direction'] = reverse_direction
-
- return result
+ return dict(limit=limit, offset=offset)
-class LinkCreator(object):
- """A way to create links for a page.
+def generateLinkFromGetArgs(request, offset_and_limits):
+ """Constructs the get args for the url.
"""
- def __init__(self, request, list_idx, limit):
- self.path = request.path
- self.base_params = dict(
- i for i in request.GET.iteritems() if
- i[0].startswith('offset_') or i[0].startswith('limit_'))
- self.idx = list_idx
- self.base_params[makeLimitKey(self.idx)] = limit
+
+ args = ["%s=%s" % (k, v) for k, v in offset_and_limits.iteritems()]
+ link_suffix = '?' + '&'.join(args)
+
+ return request.path + link_suffix
+
- def create(self, offset_keyname=None, export=False, reverse_direction=False):
- params = self.base_params.copy()
- if offset_linkid is not None:
- # TODO(dbentley): URL encode
- if offset_linkid == '':
- try:
- del params[makeOffsetLinkidKey(self.idx)]
- except KeyError:
- pass
- else:
- params[makeOffsetLinkidKey(self.idx)]=offset_linkid
- if reverse_direction:
- params[makeReverseDirectionKey(self.idx)]=True
- link_suffix = '&'.join('%s=%s' % (k, v) for k, v in params.iteritems())
- return '%s?%s' % (self.path, link_suffix)
+def generateLinkForRequest(request, base_params, updated_params):
+ """Create a link to the same page as request but with different params
+
+ Params:
+ request: the request for the page
+ base_params: the base parameters
+ updated_params: the parameters to update
+ """
+ params = base_params.copy()
+ params.update(updated_params)
+ return generateLinkFromGetArgs(request, params)
def getListContent(request, params, filter=None, order=None,
@@ -193,38 +170,16 @@
'last': offset of the last item in the list
}
"""
-
+ # TODO(dbentley): this appears to be unnecessary indirection,
+ # as we only use this logic for getForFields, which is never overridden
logic = params['logic']
- limit_key = makeLimitKey(idx)
- offset_key = makeOffsetKey(idx)
- offset_keyname_key = makeOffsetKeynameKey(idx)
- reverse_direction_key = makeReverseDirectionKey(idx)
+ limit_key, offset_key = makeLimitKey(idx), makeOffsetKey(idx)
list_params = getListParameters(request, idx)
-
- limit = list_params['limit']
- offset = list_params['offset']
- offset_keyname = list_params['offset_keyname']
- reverse_direction = list_params['reverse_direction']
-
- pagination_form = makePaginationForm(request, limit, limit_key)
-
- if offset_keyname:
- if filter is None:
- filter = {}
-
- if reverse_direction:
- filter['__key__ <'] = offset_keyname
- else:
- filter['__key__ >'] = offset_keyname
-
- if order is None:
- order = []
- if reverse_direction:
- order.append('-__key__')
- else:
- order.append('__key__')
+ limit, offset = list_params['limit'], list_params['offset']
+ pagination_form = makePaginationForm(request, list_params['limit'],
+ limit_key)
# Fetch one more to see if there should be a 'next' link
data = logic.getForFields(filter=filter, limit=limit+1, offset=offset,
@@ -234,60 +189,46 @@
return None
more = len(data) > limit
- if reverse_direction:
- data.reverse()
if more:
- if reverse_direction:
- data = data[1:]
- else:
- data = data[:limit]
-
- should_have_next_link = True
- if not reverse_direction and not more:
- should_have_next_link = False
-
- # Calculating should_have_previous_link is tricky. It's possible we could
- # be creating a previous link to a page that would have 0 entities.
- # That would be suboptimal; what's a better way?
- should_have_previous_link = False
- if offset_keyname:
- should_have_previous_link = True
- if reverse_direction and not more:
- should_have_previous_link = False
-
- if data:
- first_key_name = data[0].key().name_or_id()
- last_key_name = data[-1].key().name_or_id()
- else:
- first_key_name = None
- last_key_name = None
+ del data[limit:]
newest = next = prev = export_link = ''
- link_creator = LinkCreator(request, idx, limit)
+ base_params = dict(i for i in request.GET.iteritems() if
+ i[0].startswith('offset_') or i[0].startswith('limit_'))
if params.get('list_key_order'):
- export_link = link_creator.create(export=True)
+ export_link = generateLinkForRequest(request, base_params, {'export' : idx})
- if should_have_next_link:
- next = link_creator.create(offset_keyname=last_key_name)
+ if more:
+ # TODO(dbentley): here we need to implement a new field "last_key"
+ next = generateLinkForRequest(request, base_params, {offset_key : offset+limit,
+ limit_key : limit})
- if should_have_previous_link:
- prev = link_creator.create(offset_keyname=first_key_name,
- reverse_direction=True)
+ if offset > 0:
+ # TODO(dbentley): here we need to implement previous in the good way.
+ prev = generateLinkForRequest(request, base_params,
+ { offset_key : max(0, offset-limit),
+ limit_key : limit })
- newest = link_creator.create(offset_keyname='')
+ if offset > limit:
+ # Having a link to the first doesn't make sense on the first page (we're on
+ # it). It also doesn't make sense on the second page (because the first
+ # page is the previous page).
- # TODO(dbentley): add a "last" link (which is now possible because we can
- # query with a reverse keyname sorting
+ # NOTE(dbentley): I personally disagree that it's simpler to do that way,
+ # because sometimes you want to go to the first page without having to
+ # consider what page you're on now.
+ newest = generateLinkForRequest(request, base_params, {offset_key : 0,
+ limit_key : limit})
content = {
'idx': idx,
'data': data,
'export': export_link,
- 'first': first_key_name,
- 'last': last_key_name,
+ 'first': offset+1,
+ 'last': len(data) > 1 and offset+len(data) or None,
'logic': logic,
'limit': limit,
'newest': newest,
--- a/app/soc/views/helper/params.py Mon May 25 23:42:15 2009 +0200
+++ b/app/soc/views/helper/params.py Tue May 26 02:37:39 2009 +0200
@@ -128,7 +128,7 @@
new_params['missing_redirect'] = '/%(url_name)s/create' % params
new_params['delete_redirect'] = '/%(url_name)s/list' % params
new_params['invite_redirect'] = '/request/list'
- new_params['edit_cancel_redirect'] = '/%(url_name)s/list' % params
+ # new_params['cancel_redirect'] = '/%(url_name)s/list' % params
new_params['public_redirect'] = None
new_params['sidebar'] = None
--- a/app/soc/views/helper/responses.py Mon May 25 23:42:15 2009 +0200
+++ b/app/soc/views/helper/responses.py Tue May 26 02:37:39 2009 +0200
@@ -33,6 +33,7 @@
from soc.logic import system
from soc.logic.models import site
from soc.logic.models.user import logic as user_logic
+from soc.modules import callback
from soc.views import helper
from soc.views.helper import redirects
from soc.views.helper import templates
@@ -125,7 +126,7 @@
context['sign_in'] = users.create_login_url(request.path)
context['sign_out'] = users.create_logout_url(request.path)
- context['sidebar_menu_items'] = sidebar.getSidebar(account, user)
+ context['sidebar_menu_items'] = callback.getCore().getSidebar(account, user)
context['gae_version'] = system.getAppVersion()
context['soc_release'] = system.getMelangeVersion()
--- a/app/soc/views/models/base.py Mon May 25 23:42:15 2009 +0200
+++ b/app/soc/views/models/base.py Tue May 26 02:37:39 2009 +0200
@@ -41,7 +41,8 @@
from soc.views.helper import redirects
from soc.views.helper import requests
from soc.views.helper import responses
-from soc.views import sitemap
+from soc.views.sitemap import sidebar
+from soc.views.sitemap import sitemap
import soc.cache.logic
import soc.logic
@@ -914,7 +915,7 @@
context['entity_type_plural'] = params['name_plural']
context['entity_type_short'] = params['name_short']
context['entity_type_url'] = params['url_name']
- context['edit_cancel_redirect'] = params.get('edit_cancel_redirect')
+ context['cancel_redirect'] = params.get('cancel_redirect')
context['return_url'] = request.path
if params.get('export_content_type') and entity:
@@ -952,7 +953,7 @@
of _getSidebarItems on how it uses it.
"""
- return sitemap.sidebar.getSidebarMenus(id, user, params=params)
+ return sidebar.getSidebarMenus(id, user, params=params)
@decorators.merge_params
def getDjangoURLPatterns(self, params=None):
@@ -967,5 +968,5 @@
params: a dict with params for this View
"""
- return sitemap.sitemap.getDjangoURLPatterns(params)
+ return sitemap.getDjangoURLPatterns(params)
--- a/app/soc/views/models/organization.py Mon May 25 23:42:15 2009 +0200
+++ b/app/soc/views/models/organization.py Tue May 26 02:37:39 2009 +0200
@@ -554,6 +554,11 @@
ap_list = lists.getListContent(request, ap_params, filter, idx=0,
need_content=True)
+ # this is a temporary fix for sorting Student Projects
+ # by Student name until we have a view that default
+ # sorts it self by name (right now we can't do such query)
+ ap_list['data'].sort(key=lambda sp: sp.student.name().lower())
+
contents = []
if ap_list:
--- a/app/soc/views/models/student_project.py Mon May 25 23:42:15 2009 +0200
+++ b/app/soc/views/models/student_project.py Tue May 26 02:37:39 2009 +0200
@@ -552,7 +552,7 @@
responses.useJavaScript(context, params['js_uses_all'])
context['page_name'] = page_name
# cancel should go to the public view
- params['edit_cancel_redirect'] = redirects.getPublicRedirect(entity, params)
+ params['cancel_redirect'] = redirects.getPublicRedirect(entity, params)
if request.POST:
return self.stEditPost(request, context, params, entity, **kwargs)
--- a/app/soc/views/sitemap/build.py Mon May 25 23:42:15 2009 +0200
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,112 +0,0 @@
-#!/usr/bin/python2.5
-#
-# Copyright 2008 the Melange authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Module that constructs the sitemap.
-"""
-
-__authors__ = [
- '"Sverre Rabbelier" <sverre@rabbelier.nl>',
- ]
-
-
-from django.conf.urls import defaults
-
-#from soc.views.models import club
-#from soc.views.models import club_app
-#from soc.views.models import club_admin
-#from soc.views.models import club_member
-from soc.views.models import cron
-from soc.views.models import document
-from soc.views.models import host
-from soc.views.models import job
-from soc.views.models import mentor
-from soc.views.models import notification
-from soc.views.models import organization
-from soc.views.models import org_admin
-from soc.views.models import org_app
-from soc.views.models import priority_group
-from soc.views.models import program
-from soc.views.models import request
-from soc.views.models import site
-from soc.views.models import sponsor
-from soc.views.models import student
-from soc.views.models import student_project
-from soc.views.models import student_proposal
-from soc.views.models import timeline
-from soc.views.models import user
-from soc.views.models import user_self
-
-from soc.views.sitemap import sidebar
-from soc.views.sitemap import sitemap
-
-
-# TODO: instead of commenting out club stuff, make it depend on a setting
-
-
-sidebar.addMenu(user_self.view.getSidebarMenus)
-#sidebar.addMenu(club.view.getSidebarMenus)
-#sidebar.addMenu(club.view.getExtraMenus)
-#sidebar.addMenu(club_admin.view.getSidebarMenus)
-#sidebar.addMenu(club_member.view.getSidebarMenus)
-#sidebar.addMenu(club_app.view.getSidebarMenus)
-sidebar.addMenu(site.view.getSidebarMenus)
-sidebar.addMenu(user.view.getSidebarMenus)
-#sidebar.addMenu(document.view.getSidebarMenus)
-sidebar.addMenu(sponsor.view.getSidebarMenus)
-sidebar.addMenu(sponsor.view.getExtraMenus)
-sidebar.addMenu(host.view.getSidebarMenus)
-sidebar.addMenu(request.view.getSidebarMenus)
-sidebar.addMenu(program.view.getSidebarMenus)
-sidebar.addMenu(program.view.getExtraMenus)
-sidebar.addMenu(student.view.getSidebarMenus)
-sidebar.addMenu(student_project.view.getSidebarMenus)
-sidebar.addMenu(student_proposal.view.getSidebarMenus)
-sidebar.addMenu(organization.view.getSidebarMenus)
-sidebar.addMenu(organization.view.getExtraMenus)
-sidebar.addMenu(org_admin.view.getSidebarMenus)
-sidebar.addMenu(mentor.view.getSidebarMenus)
-sidebar.addMenu(org_app.view.getSidebarMenus)
-
-#sitemap.addPages(club.view.getDjangoURLPatterns())
-#sitemap.addPages(club_admin.view.getDjangoURLPatterns())
-#sitemap.addPages(club_app.view.getDjangoURLPatterns())
-#sitemap.addPages(club_member.view.getDjangoURLPatterns())
-sitemap.addPages(cron.view.getDjangoURLPatterns())
-sitemap.addPages(document.view.getDjangoURLPatterns())
-sitemap.addPages(host.view.getDjangoURLPatterns())
-sitemap.addPages(job.view.getDjangoURLPatterns())
-sitemap.addPages(mentor.view.getDjangoURLPatterns())
-sitemap.addPages(notification.view.getDjangoURLPatterns())
-sitemap.addPages(organization.view.getDjangoURLPatterns())
-sitemap.addPages(org_admin.view.getDjangoURLPatterns())
-sitemap.addPages(org_app.view.getDjangoURLPatterns())
-sitemap.addPages(priority_group.view.getDjangoURLPatterns())
-sitemap.addPages(program.view.getDjangoURLPatterns())
-sitemap.addPages(request.view.getDjangoURLPatterns())
-sitemap.addPages(site.view.getDjangoURLPatterns())
-sitemap.addPages(sponsor.view.getDjangoURLPatterns())
-sitemap.addPages(student.view.getDjangoURLPatterns())
-sitemap.addPages(student_project.view.getDjangoURLPatterns())
-sitemap.addPages(student_proposal.view.getDjangoURLPatterns())
-sitemap.addPages(timeline.view.getDjangoURLPatterns())
-sitemap.addPages(user_self.view.getDjangoURLPatterns())
-sitemap.addPages(user.view.getDjangoURLPatterns())
-
-
-def getPatterns():
- """Retrieves all the url patterns of this site.
- """
- return defaults.patterns(None, *sitemap.SITEMAP)
--- a/app/soc/views/sitemap/sidebar.py Mon May 25 23:42:15 2009 +0200
+++ b/app/soc/views/sitemap/sidebar.py Tue May 26 02:37:39 2009 +0200
@@ -24,38 +24,11 @@
from soc.views import out_of_band
-import soc.cache.sidebar
-
-SIDEBAR = []
SIDEBAR_ACCESS_ARGS = ['SIDEBAR_CALLING']
SIDEBAR_ACCESS_KWARGS = {'SIDEBAR_CALLING': True}
-def addMenu(callback):
- """Adds a callback to the menu builder.
-
- The callback should return a list of menu's when called.
- """
- global SIDEBAR
- SIDEBAR.append(callback)
-
-
-@soc.cache.sidebar.cache
-def getSidebar(id, user):
- """Constructs a sidebar for the current user.
- """
-
- sidebar = []
-
- for callback in SIDEBAR:
- menus = callback(id, user)
-
- for menu in (menus if menus else []):
- sidebar.append(menu)
-
- return sorted(sidebar, key=lambda x: x.get('group'))
-
def getSidebarItems(params):
"""Retrieves a list of sidebar entries for this view.
--- a/app/soc/views/sitemap/sitemap.py Mon May 25 23:42:15 2009 +0200
+++ b/app/soc/views/sitemap/sitemap.py Tue May 26 02:37:39 2009 +0200
@@ -22,17 +22,6 @@
]
-SITEMAP = []
-
-
-def addPages(pages):
- """Adds the specified pages to the sitemap.
- """
-
- global SITEMAP
- SITEMAP += pages
-
-
def getDjangoURLPatterns(params):
"""Retrieves a list of sidebar entries for this View.
--- a/app/urls.py Mon May 25 23:42:15 2009 +0200
+++ b/app/urls.py Tue May 26 02:37:39 2009 +0200
@@ -18,15 +18,15 @@
__authors__ = [
'"Augie Fackler" <durin42@gmail.com>',
'"Todd Larsen" <tlarsen@google.com>',
+ '"Sverre Rabbelier" <sverre@rabbelier.nl>',
'"Lennard de Rijk" <ljvderijk@gmail.com>',
'"Pawel Solyga" <pawel.solyga@gmail.com>',
]
-from soc.views.sitemap import build
+from soc.modules import callback
-
-urlpatterns = build.getPatterns()
+urlpatterns = callback.getCore().getPatterns()
# define the error handlers
handler404 = 'django.views.defaults.page_not_found'
--- a/scripts/build.sh Mon May 25 23:42:15 2009 +0200
+++ b/scripts/build.sh Tue May 26 02:37:39 2009 +0200
@@ -10,8 +10,8 @@
DEFAULT_APP_BUILD=../build
DEFAULT_APP_FOLDER="../app"
-DEFAULT_APP_FILES="app.yaml cron.yaml index.yaml main.py settings.py urls.py"
-DEFAULT_APP_DIRS="soc ghop gsoc feedparser python25src reflistprop jquery ranklist json"
+DEFAULT_APP_FILES="app.yaml cron.yaml index.yaml main.py settings.py shell.py urls.py gae_django.py"
+DEFAULT_APP_DIRS="soc ghop gsoc feedparser python25src reflistprop jquery ranklist shell json htmlsanitizer"
DEFAULT_ZIP_FILES="tiny_mce.zip"
APP_BUILD=${APP_BUILD:-"${DEFAULT_APP_BUILD}"}
@@ -20,6 +20,17 @@
APP_DIRS=${APP_DIRS:-"${DEFAULT_APP_DIRS}"}
ZIP_FILES=${ZIP_FILES:-"${DEFAULT_ZIP_FILES}"}
+
+if [ "$1" != "--skip-pylint" ]; then
+ cd pylint
+ bash do_pylint.sh --silent
+ if [ "$?" != "1" ] ; then
+ echo ' Build failed. Build script encountered pylint errors.'
+ exit 1
+ fi
+ cd ..
+fi
+
if [ -e $APP_FOLDER ] ; then
cd $APP_FOLDER
else
--- a/scripts/pylint/do_pylint.sh Mon May 25 23:42:15 2009 +0200
+++ b/scripts/pylint/do_pylint.sh Tue May 26 02:37:39 2009 +0200
@@ -36,7 +36,6 @@
PROJ_DIR=$(cd "$PROJ_DIR"; pwd)
APP_DIR="${PROJ_DIR}/app"
-# Note: We will add ghop and gsoc modules once there something in there
CHECK_MODULES="soc reflistprop settings.py urls.py main.py"
PYLINTRC=$(dirname "$0")/pylintrc
@@ -60,3 +59,4 @@
done
pylint $SILENT_ARGS $ARGS $CHECK_MODULES_PATHS
+exit $?
\ No newline at end of file
--- a/scripts/stats.py Mon May 25 23:42:15 2009 +0200
+++ b/scripts/stats.py Tue May 26 02:37:39 2009 +0200
@@ -277,6 +277,21 @@
job_logic.updateOrCreateFromFields(job_fields)
+def startUniqueUserIdConversion():
+ """Creates the job that is responsible for adding unique user ids.
+ """
+
+ from soc.logic.models.job import logic as job_logic
+ from soc.logic.models.priority_group import logic as priority_logic
+
+ priority_group = priority_logic.getGroup(priority_logic.CONVERT)
+ job_fields = {
+ 'priority_group': priority_group,
+ 'task_name': 'setupUniqueUserIdAdder'}
+
+ job_logic.updateOrCreateFromFields(job_fields)
+
+
def reviveJobs(amount):
"""Sets jobs that are stuck in 'aborted' to waiting.
@@ -357,6 +372,87 @@
cPickle.dump(target, f)
+def acceptedStudentsCSVExport(csv_filename, program_key_name):
+ """Exports all accepted Students for particular program into CSV file.
+ """
+ # TODO(Pawel.Solyga): Add additional Program parameter to this method
+ # so we export students from different programs
+ # TODO(Pawel.Solyga): Make it universal so it works with both GHOP
+ # and GSoC programs
+
+ from soc.models.student_project import StudentProject
+ from soc.models.student import Student
+ from soc.models.organization import Organization
+
+ getStudentProjects = getEntities(StudentProject)
+ student_projects = getStudentProjects()
+ student_projects_amount = len(student_projects)
+ print "Fetched %d Student Projects." % student_projects_amount
+ print "Fetching Student entities from Student Projects."
+ accepted_students = {}
+ student_organization = {}
+ counter = 0
+ for sp_key in student_projects.keys():
+ key = student_projects[sp_key].student.key().name()
+ accepted_students[key] = student_projects[sp_key].student
+ org_name = student_projects[sp_key].scope.name
+ student_organization[key] = org_name
+ counter += 1
+ print str(counter) + '/' + str(student_projects_amount) + ' ' + key + ' (' + org_name + ')'
+ print "All Student entities fetched."
+
+ students_key_order = ['link_id', 'given_name', 'surname',
+ 'name_on_documents', 'email', 'res_street', 'res_city', 'res_state',
+ 'res_country', 'res_postalcode', 'phone', 'ship_street', 'ship_city',
+ 'ship_state', 'ship_country', 'ship_postalcode', 'birth_date',
+ 'tshirt_size', 'tshirt_style', 'name', 'school_name', 'school_country',
+ 'major', 'degree']
+
+ print "Preparing Students data for export."
+ students_data = [accepted_students[i].toDict(students_key_order) for i in accepted_students.keys()]
+
+ print "Adding organization name to Students data."
+ for student in students_data:
+ student['organization'] = student_organization[program_key_name + '/' + student['link_id']]
+
+ students_key_order.append('organization')
+
+ saveDataToCSV(csv_filename, students_data, students_key_order)
+ print "Accepted Students exported to %s file." % csv_filename
+
+
+def saveDataToCSV(csv_filename, data, key_order):
+ """Saves data in order into CSV file.
+
+ This is a helper function used with acceptedStudentsCSVExport().
+ """
+
+ import csv
+ import StringIO
+
+ from soc.logic import dicts
+
+ file_handler = StringIO.StringIO()
+
+ writer = csv.DictWriter(file_handler, key_order, dialect='excel')
+ writer.writerow(dicts.identity(key_order))
+
+ # encode the data to UTF-8 to ensure compatibility
+ for row_dict in data:
+ for key in row_dict.keys():
+ value = row_dict[key]
+ if isinstance(value, basestring):
+ row_dict[key] = value.encode("utf-8")
+ else:
+ row_dict[key] = str(value)
+ writer.writerow(row_dict)
+
+ csv_data = file_handler.getvalue()
+ csv_file = open(csv_filename, 'w')
+ csv_file.write(csv_data)
+ csv_file.close()
+
+
def main(args):
"""Main routine.
"""
@@ -411,6 +507,8 @@
'startSpam': startSpam,
'reviveJobs': reviveJobs,
'deidleJobs': deidleJobs,
+ 'acceptedStudentsCSVExport': acceptedStudentsCSVExport,
+ 'startUniqueUserIdConversion': startUniqueUserIdConversion,
}
interactive.remote(args, context)
--- a/tests/run.py Mon May 25 23:42:15 2009 +0200
+++ b/tests/run.py Tue May 26 02:37:39 2009 +0200
@@ -9,6 +9,7 @@
os.path.join(appengine_location, 'lib', 'django'),
os.path.join(appengine_location, 'lib', 'webob'),
os.path.join(appengine_location, 'lib', 'yaml', 'lib'),
+ os.path.join(appengine_location, 'lib', 'antlr3'),
appengine_location,
os.path.join(HERE, 'app'),
os.path.join(HERE, 'thirdparty', 'coverage'),
@@ -32,7 +33,9 @@
def afterTest(self, test):
from google.appengine.api import apiproxy_stub_map
datastore = apiproxy_stub_map.apiproxy.GetStub('datastore')
- datastore.Clear()
+ # clear datastore iff one is available
+ if datastore is not None:
+ datastore.Clear()
def main():
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/test_functional.py Tue May 26 02:37:39 2009 +0200
@@ -0,0 +1,100 @@
+#!/usr/bin/python2.5
+#
+# Copyright 2009 the Melange authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+__authors__ = [
+ '"Matthew Wilkes" <matthew@matthewwilkes.co.uk>',
+ ]
+
+
+from gaeftest.test import FunctionalTestCase
+
+from zope.testbrowser import browser
+
+import os.path
+
+
+class MelangeFunctionalTestCase(FunctionalTestCase):
+ """A base class for all functional tests in Melange.
+
+ Tests MUST NOT be defined here, but the superclass requires a path
+ attribute that points to the app.yaml. Utility functions MAY be
+ declared here to be shared by all functional tests, but any
+ overridden unittest methods MUST call the superclass version.
+ """
+
+ path = os.path.abspath(__file__+"/../../app/app.yaml")
+
+
+class TestBranding(MelangeFunctionalTestCase):
+ """Tests that ensure Melange properly displays attribution.
+
+ Other notices, as required by the project and/or law, are tested
+ here as well.
+ """
+
+ def test_attribution(self):
+ """Ensure that the front page asserts that it is a Melange app.
+ """
+
+ tb = browser.Browser()
+ tb.open("http://127.0.0.1:8080/site/show/site")
+
+ self.assertTrue("Powered by Melange" in tb.contents)
+
+
+class TestLogin(MelangeFunctionalTestCase):
+ """Tests that check the login system is functioning correctly.
+
+ Also tests that users go through the correct registration workflow.
+ """
+
+ def test_firstLogin(self):
+ """Ensure that new users are prompted to create a profile.
+
+ Also test that only new users are prompted.
+ """
+
+ tb = browser.Browser()
+ tb.open("http://127.0.0.1:8080")
+
+ tb.getLink("Sign in").click()
+ self.assertTrue("login" in tb.url)
+
+ # fill in dev_appserver login form
+ tb.getForm().getControl("Email").value = "newuser@example.com"
+ tb.getForm().getControl("Login").click()
+
+ self.assertTrue(tb.url.endswith("/show/site"))
+ self.assertTrue('Please create <a href="/user/create_profile">'
+ 'User Profile</a> in order to view this page' in tb.contents)
+
+ tb.getLink("User Profile").click()
+
+ # fill in the user profile
+ cp = tb.getForm(action="create_profile")
+ cp.getControl(name="link_id").value = "exampleuser"
+ cp.getControl(name="name").value = "Example user"
+ cp.getControl("Save").click()
+
+ # if all is well, we go to the edit page
+ self.assertTrue("edit_profile" in tb.url)
+
+ tb.open("http://127.0.0.1:8080")
+
+ # call to action no longer on front page
+ self.assertFalse('Please create <a href="/user/create_profile">'
+ 'User Profile</a> in order to view this page' in tb.contents)
\ No newline at end of file