# HG changeset patch # User Sverre Rabbelier # Date 1243298259 -7200 # Node ID 4cc66ab098e8444ab03d4a6817be70539b0f3513 # Parent 616df973e457a15183f37eee3e3cf0477b6df32f# Parent 3156760b4d269dd3c88adda4509eed2ce4702944 remove accidental head diff -r 3156760b4d26 -r 4cc66ab098e8 app/app.yaml.template --- a/app/app.yaml.template Mon May 25 23:42:15 2009 +0200 +++ b/app/app.yaml.template Tue May 26 02:37:39 2009 +0200 @@ -46,6 +46,14 @@ - url: /json static_dir: json +- url: /admin/shell.* + script: shell/shell.py + login: admin + +- url: /static + static_dir: shell/static + expiration: 1d + - url: /.* script: main.py diff -r 3156760b4d26 -r 4cc66ab098e8 app/gae_django.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/app/gae_django.py Tue May 26 02:37:39 2009 +0200 @@ -0,0 +1,61 @@ +#!/usr/bin/python2.5 +# +# Copyright 2008 the Melange authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Module containing Melange Django 1.0+ configuration for Google App Engine. +""" + +import logging +import os +import sys + +__authors__ = [ + # alphabetical order by last name, please + '"Pawel Solyga" ', + ] + + +# Remove the standard version of Django. +for k in [k for k in sys.modules if k.startswith('django')]: + del sys.modules[k] + +# Force sys.path to have our own directory first, in case we want to import +# from it. 
This lets us replace the built-in Django +sys.path.insert(0, os.path.abspath(os.path.dirname(__file__))) + +sys.path.insert(0, os.path.abspath('django.zip')) + +# Force Django to reload its settings. +from django.conf import settings +settings._target = None + +# Must set this env var before importing any part of Django +os.environ['DJANGO_SETTINGS_MODULE'] = 'settings' + +import django.core.signals +import django.db + +# Log errors. +def log_exception(*args, **kwds): + """Function used for logging exceptions. + """ + logging.exception('Exception in request:') + +# Log all exceptions detected by Django. +django.core.signals.got_request_exception.connect(log_exception) + +# Unregister the rollback event handler. +django.core.signals.got_request_exception.disconnect( + django.db._rollback_on_exception) diff -r 3156760b4d26 -r 4cc66ab098e8 app/ghop/__init__.py diff -r 3156760b4d26 -r 4cc66ab098e8 app/ghop/models/comment.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/app/ghop/models/comment.py Tue May 26 02:37:39 2009 +0200 @@ -0,0 +1,40 @@ +#!/usr/bin/python2.5 +# +# Copyright 2009 the Melange authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""This module contains the GHOP specific Comment Model. 
+""" + +__authors__ = [ + '"Madhusudan.C.S" ', +] + + +from google.appengine.ext import db + +from django.utils.translation import ugettext + +import soc.models.comment + + +class GHOPComment(soc.models.comment.Comment): + """GHOP Comment model for tasks, extends the basic Comment model. + """ + + #: Property containing the human readable string that should be + #: shown for the comment when something in the task changes, + #: code.google.com issue tracker style + change_in_task = db.StringProperty(required=True, + verbose_name=ugettext('Changes in the task')) diff -r 3156760b4d26 -r 4cc66ab098e8 app/ghop/models/org_prize_assignment.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/app/ghop/models/org_prize_assignment.py Tue May 26 02:37:39 2009 +0200 @@ -0,0 +1,54 @@ +#!/usr/bin/python2.5 +# +# Copyright 2009 the Melange authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""This module contains the GHOP PrizePerOrg Model. +""" + +__authors__ = [ + '"Madhusudan.C.S" ', +] + + +from google.appengine.ext import db + +import soc.models.base + +import ghop.models.organization +import ghop.models.program + + +class GHOPOrgPrizeAssignment(soc.models.base.ModelWithFieldAttributes): + """Model for prizes assigned to Students by an Organization. 
+ """ + + #: Program to which these winners belong to + program = db.ReferenceProperty(reference_class=ghop.models.program.GHOPProgram, + required=True, + collection_name='program_prizes') + + #: Organization to which these winners belong to + org = db.ReferenceProperty( + reference_class=ghop.models.organization.GHOPOrganization, + required=True, collection_name='organization_prizes') + + #: Ordered list of winners(reference to Student entities) for the given + #: organization under the specified program + winners = db.ListProperty(item_type=db.Key, default=[]) + + #: unordered list of runner-ups(reference to Student entities) for the given + #: organization under the specified program + runner_ups = db.ListProperty(item_type=db.Key, default=[]) + diff -r 3156760b4d26 -r 4cc66ab098e8 app/ghop/models/organization.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/app/ghop/models/organization.py Tue May 26 02:37:39 2009 +0200 @@ -0,0 +1,35 @@ +#!/usr/bin/python2.5 +# +# Copyright 2009 the Melange authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""This module contains the GHOP specific Organization Model. +""" + +__authors__ = [ + '"Madhusudan.C.S" ', +] + + +from google.appengine.ext import db + +import soc.models.organization + + +class GHOPOrganization(soc.models.organization.Organization): + """GHOP Organization model extends the basic Organization model. + """ + + #: Property that stores the amount of tasks the organization can publish. 
+ task_quota_limit = db.IntegerProperty(required=False, default=0) diff -r 3156760b4d26 -r 4cc66ab098e8 app/ghop/models/program.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/app/ghop/models/program.py Tue May 26 02:37:39 2009 +0200 @@ -0,0 +1,76 @@ +#!/usr/bin/python2.5 +# +# Copyright 2009 the Melange authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""This module contains the GHOP specific Program Model. +""" + +__authors__ = [ + '"Madhusudan.C.S" ', +] + + +from google.appengine.ext import db + +from django.utils.translation import ugettext + +import soc.models.program + + +class GHOPProgram(soc.models.program.Program): + """GHOP Program model extends the basic Program model. + """ + + #: Property that contains the latest date of birth before which a Student + #: can participate + student_min_age = db.DateTimeProperty(required=False) + student_min_age.help_text = ugettext( + 'Minimum age of the student to sign-up. Given by the latest birthdate allowed') + + #: Required property containing the number of Tasks Students can work + #: on simultaneously. 
For GHOP it is 1 + nr_simultaneous_tasks = db.IntegerProperty( + required=True, default=1, + verbose_name=ugettext('Simultaneous tasks')) + nr_simultaneous_tasks.help_text = ugettext( + 'Number of tasks students can work on simultaneously in the program.') + + #: Property containing the number of winners per Organization + nr_winners = db.IntegerProperty( + required=True, default=0, + verbose_name=ugettext('Winners per organization')) + nr_winners.help_text = ugettext( + 'Number of winners an organization can announce.') + + #: Property containing the number of runner ups per Organization + nr_runnerups = db.IntegerProperty( + required=True, default=0, + verbose_name=ugettext('Runner-ups per organization')) + nr_runnerups.help_text = ugettext( + 'Number of runner-ups an organization can announce.') + + #: A list of difficulty levels that can be assigned for each Task created + task_difficulties = db.StringListProperty( + required=True, default=[''], + verbose_name=ugettext('Difficulty levels')) + task_difficulties.help_text = ugettext( + 'List all the difficulty levels that can be assigned to a task.') + + #: A list of task types that a Task can belong to + task_types = db.StringListProperty( + required=True, default=['Any'], + verbose_name=ugettext('Task Types')) + task_types.help_text = ugettext( + 'List all the types a task can be in.') diff -r 3156760b4d26 -r 4cc66ab098e8 app/ghop/models/task.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/app/ghop/models/task.py Tue May 26 02:37:39 2009 +0200 @@ -0,0 +1,181 @@ +#!/usr/bin/python2.5 +# +# Copyright 2009 the Melange authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""This module contains the GHOP Task Model. +""" + +__authors__ = [ + '"Madhusudan.C.S" ', + '"Lennard de Rijk" ', +] + + +from google.appengine.ext import db + +from django.utils.translation import ugettext + +import soc.models.linkable +import soc.models.role +import soc.models.student +import soc.models.user + +import ghop.models.program + + +class GHOPTask(soc.models.linkable.Linkable): + """Model for a task used in GHOP workflow. + + The scope property of Linkable will be set to the Organization to which + this task belongs to. A link_id will be generated automatically and will + have no specific meaning other than identification. + """ + + #: Required field indicating the "title" of the task + title = db.StringProperty(required=True, + verbose_name=ugettext('Title')) + title.help_text = ugettext('Title of the task') + + #: Required field containing the description of the task + description = db.TextProperty(required=True, + verbose_name=ugettext('Description')) + description.help_text = ugettext('Complete description of the task') + + #: Field indicating the difficulty level of the Task. This is not + #: mandatory so the it can be assigned at any later stage. + #: The options are configured by a Program Admin. + difficulty = db.StringProperty(required=False, + verbose_name=ugettext('Difficulty')) + difficulty.help_text = ugettext('Difficulty Level of the task') + + #: Required field which contains the type of the task. These types are + #: configured by a Program Admin. 
+ type = db.StringListProperty(required=True, + verbose_name=ugettext('Task Type')) + type.help_text = ugettext('Type of the task') + + #: A field which contains time allowed for completing the task (in hours) + #: from the moment that this task has been assigned to a Student + time_to_complete = db.IntegerProperty(required=True, + verbose_name=('Time to Complete')) + time_to_complete.help_text = ugettext( + 'Time allowed to complete the task, in hours, once it is claimed') + + #: List of Mentors assigned to this task. A Mentor who creates this + #: task is assigned as the Mentor by default. An Org Admin will have + #: to assign a Mentor upon task creation. + mentors = db.ListProperty(item_type=db.Key, default=[]) + + #: User profile to whom this task has been claimed by. This field + #: is mandatory for claimed tasks + user = db.ReferenceProperty(reference_class=soc.models.user.User, + required=False, + collection_name='assigned_tasks') + + #: Student profile to whom this task is currently assigned to. If the user + #: has registered as a Student than this field will be filled in. This field + #: is mandatory for all Tasks in the closed state. + student = db.ReferenceProperty(reference_class=soc.models.student.Student, + required=False, + collection_name='assigned_tasks') + + #: Program in which this Task has been created + program = db.ReferenceProperty(reference_class=ghop.models.program.GHOPProgram, + required=True, + collection_name='tasks') + + #: Required property which holds the state, the Task is currently in. + #: This is a hidden field not shown on forms. Handled by logic internally. + #: The state can be one of the following: + #: unapproved: If Task is created by a Mentor, this is the automatically + #: assigned state. + #: unpublished: This Task is not published yet. + #: open: This Task is open and ready to be claimed. + #: reopened: This Task has been claimed but never finished and has been + #: reopened. 
+ #: claim_requested: A Student has requested to claim this task. + #: claimed: This Task has been claimed and someone is working on it. + #: action_needed: Work on this Task must be submitted for review within + #: 24 hours. + #: closed: Work on this Task has been completed to the org's content. + #: awaiting_registration: Student has completed work on this task, but + #: needs to complete Student registration before this task is closed. + #: needs_work: This work on this Tasks needs a bit more brushing up. This + #: state is followed by a Mentor review. + #: needs_review: Student has submitted work for this task and it should + #: be reviewed by a Mentor. + status = db.StringProperty( + required=True, verbose_name=ugettext('Status'), + choices=['unapproved', 'unpublished', 'open', 'reopened', + 'claim_requested', 'claimed', 'action_needed', + 'closed', 'awaiting_registration', 'needs_work', + 'needs_review'], + default='unapproved') + + #: A field which indicates if the Task was ever in the Reopened state. + #: True indicates that its state was Reopened once, false indicated that it + #: has never been in the Reopened state. + was_reopened = db.BooleanProperty(default=False, + verbose_name=ugettext('Has been reopened')) + + #: This field is set to the next deadline that will have consequences for + #: this Task. For instance this will store a DateTime property which will + #: tell when this Task should be completed. 
+ deadline = db.DateTimeProperty(required=False, + verbose_name=ugettext('Deadline')) + + #: Required field containing the Mentor/Org Admin who created this task + created_by = db.ReferenceProperty(reference_class=soc.models.role.Role, + required=True, + collection_name='created_tasks', + verbose_name=ugettext('Created by')) + + #: Date when the proposal was created + created_on = db.DateTimeProperty(required=True, auto_now_add=True, + verbose_name=ugettext('Created on')) + + #: Required field containing the Mentor/Org Admin who last edited this + #: task. It changes only when Mentor/Org Admin changes title, description, + #: difficulty, type, time_to_complete. + modified_by = db.ReferenceProperty(reference_class=soc.models.role.Role, + required=True, + collection_name='edited_tasks', + verbose_name=ugettext('Modified by')) + + #: Date when the proposal was last modified, should be set manually on edit + modified_on = db.DateTimeProperty(required=True, auto_now_add=True, + verbose_name=ugettext('Modified on')) + + #: A field which holds the entire history of this task in JSON. The + #: structure of this JSON string is as follows: + #: { + #: timestamp1: { + #: "user": User reference + #: "student": Student reference + #: ... + #: "state": "Unapproved" + #: ... + #: "edited_by": Role reference + #: + #: } + #: timestamp2: { + #: "state": "Unpublished" + #: } + #: } + #: First dictionary item holds the values for all the properties in this + #: model. The subsequent items hold the properties that changed at the + #: timestamp given by the key. + #: Reference properties will be stored by calling str() on their Key. + history = db.TextProperty(required=True, default='') diff -r 3156760b4d26 -r 4cc66ab098e8 app/ghop/models/timeline.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/app/ghop/models/timeline.py Tue May 26 02:37:39 2009 +0200 @@ -0,0 +1,62 @@ +#!/usr/bin/python2.5 +# +# Copyright 2009 the Melange authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""This module contains the GHOP specific Timeline Model. +""" + +__authors__ = [ + '"Madhusudan.C.S" ', +] + + +from google.appengine.ext import db + +from django.utils.translation import ugettext + +import soc.models.timeline + + +class GHOPTimeline(soc.models.timeline.Timeline): + """GHOP Timeline model extends the basic Timeline model. It implements + the GHOP specific timeline entries. + """ + + task_claim_deadline = db.DateTimeProperty( + verbose_name=ugettext('Task Claim Deadline date')) + task_claim_deadline.help_text = ugettext( + 'No tasks can be claimed after this date.' 
+ 'Work on claimed tasks can continue.') + + stop_all_work = db.DateTimeProperty( + verbose_name=ugettext('Work Submission Deadline date')) + stop_all_work.help_text = ugettext( + 'All work must stop by this date.') + + winner_selection_start = db.DateTimeProperty( + verbose_name=ugettext('Winner Selection Start date')) + winner_selection_start.help_text = ugettext( + 'Organizations start choosing their winners.') + + winner_selection_end = db.DateTimeProperty( + verbose_name=ugettext('Winner Selection End date')) + winner_selection_end.help_text = ugettext( + 'Organizations must have completed choosing their winners.') + + winner_announcement = db.DateTimeProperty( + verbose_name=ugettext('Winner Announcement date')) + winner_announcement.help_text = ugettext( + 'All winners are announced.') + diff -r 3156760b4d26 -r 4cc66ab098e8 app/ghop/models/work_submission.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/app/ghop/models/work_submission.py Tue May 26 02:37:39 2009 +0200 @@ -0,0 +1,72 @@ +#!/usr/bin/python2.5 +# +# Copyright 2009 the Melange authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""This module contains the GHOP WorkSubmission Model. 
+""" + +__authors__ = [ + '"Madhusudan.C.S" ', + '"Lennard de Rijk" ', +] + + +from google.appengine.ext import db + +from django.utils.translation import ugettext + +import soc.models.linkable +import soc.models.user + +import ghop.models.program +import ghop.models.task + + +class GHOPWorkSubmission(soc.models.linkable.Linkable): + """Model for work submissions for a task by students. + + Scope will be set to the Organization to which this work has been submitted. + """ + + #: Task to which this work was submitted + task = db.ReferenceProperty(reference_class=ghop.models.task.GHOPTask, + required=True, + collection_name='work_submissions') + + #: User who submitted this work + user = db.ReferenceProperty(reference_class=soc.models.user.User, + required=True, + collection_name='work_submissions') + + #: Program to which this work belongs to + program = db.ReferenceProperty(reference_class=ghop.models.program.GHOPProgram, + required=True, + collection_name='work_submissions') + + #: Property allowing you to store information about your work + information = db.TextProperty( + required=True, verbose_name=ugettext('Info')) + information.help_text = ugettext( + 'Information about the work you submit for this task') + + #: Property containing an URL to this work or more information about it + url_to_work = db.LinkProperty( + required=False, verbose_name=ugettext('URL to your Work')) + url_to_work.help_text = ugettext( + 'URL to a resource containing your work or more information about it') + + #: Property containing the date when the work was submitted + submitted_on = db.DateTimeProperty(required=True, auto_now_add=True, + verbose_name=ugettext('Submitted on')) diff -r 3156760b4d26 -r 4cc66ab098e8 app/htmlsanitizer/BeautifulSoup.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/app/htmlsanitizer/BeautifulSoup.py Tue May 26 02:37:39 2009 +0200 @@ -0,0 +1,2000 @@ +"""Beautiful Soup +Elixir and Tonic +"The Screen-Scraper's Friend" 
+http://www.crummy.com/software/BeautifulSoup/ + +Beautiful Soup parses a (possibly invalid) XML or HTML document into a +tree representation. It provides methods and Pythonic idioms that make +it easy to navigate, search, and modify the tree. + +A well-formed XML/HTML document yields a well-formed data +structure. An ill-formed XML/HTML document yields a correspondingly +ill-formed data structure. If your document is only locally +well-formed, you can use this library to find and process the +well-formed part of it. + +Beautiful Soup works with Python 2.2 and up. It has no external +dependencies, but you'll have more success at converting data to UTF-8 +if you also install these three packages: + +* chardet, for auto-detecting character encodings + http://chardet.feedparser.org/ +* cjkcodecs and iconv_codec, which add more encodings to the ones supported + by stock Python. + http://cjkpython.i18n.org/ + +Beautiful Soup defines classes for two main parsing strategies: + + * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific + language that kind of looks like XML. + + * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid + or invalid. This class has web browser-like heuristics for + obtaining a sensible parse tree in the face of common HTML errors. + +Beautiful Soup also defines a class (UnicodeDammit) for autodetecting +the encoding of an HTML or XML document, and converting it to +Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser. + +For more than you ever wanted to know about Beautiful Soup, see the +documentation: +http://www.crummy.com/software/BeautifulSoup/documentation.html + +Here, have some legalese: + +Copyright (c) 2004-2009, Leonard Richardson + +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of the the Beautiful Soup Consortium and All + Night Kosher Bakery nor the names of its contributors may be + used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT. 
+ +""" +from __future__ import generators + +__author__ = "Leonard Richardson (leonardr@segfault.org)" +__version__ = "3.1.0.1" +__copyright__ = "Copyright (c) 2004-2009 Leonard Richardson" +__license__ = "New-style BSD" + +import codecs +import markupbase +import types +import re +from HTMLParser import HTMLParser, HTMLParseError +try: + from htmlentitydefs import name2codepoint +except ImportError: + name2codepoint = {} +try: + set +except NameError: + from sets import Set as set + +#These hacks make Beautiful Soup able to parse XML with namespaces +markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match + +DEFAULT_OUTPUT_ENCODING = "utf-8" + +# First, the classes that represent markup elements. + +def sob(unicode, encoding): + """Returns either the given Unicode string or its encoding.""" + if encoding is None: + return unicode + else: + return unicode.encode(encoding) + +class PageElement: + """Contains the navigational information for some part of the page + (either a tag or a piece of text)""" + + def setup(self, parent=None, previous=None): + """Sets up the initial relations between this element and + other elements.""" + self.parent = parent + self.previous = previous + self.next = None + self.previousSibling = None + self.nextSibling = None + if self.parent and self.parent.contents: + self.previousSibling = self.parent.contents[-1] + self.previousSibling.nextSibling = self + + def replaceWith(self, replaceWith): + oldParent = self.parent + myIndex = self.parent.contents.index(self) + if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent: + # We're replacing this element with one of its siblings. + index = self.parent.contents.index(replaceWith) + if index and index < myIndex: + # Furthermore, it comes before this element. That + # means that when we extract it, the index of this + # element will change. 
+ myIndex = myIndex - 1 + self.extract() + oldParent.insert(myIndex, replaceWith) + + def extract(self): + """Destructively rips this element out of the tree.""" + if self.parent: + try: + self.parent.contents.remove(self) + except ValueError: + pass + + #Find the two elements that would be next to each other if + #this element (and any children) hadn't been parsed. Connect + #the two. + lastChild = self._lastRecursiveChild() + nextElement = lastChild.next + + if self.previous: + self.previous.next = nextElement + if nextElement: + nextElement.previous = self.previous + self.previous = None + lastChild.next = None + + self.parent = None + if self.previousSibling: + self.previousSibling.nextSibling = self.nextSibling + if self.nextSibling: + self.nextSibling.previousSibling = self.previousSibling + self.previousSibling = self.nextSibling = None + return self + + def _lastRecursiveChild(self): + "Finds the last element beneath this object to be parsed." + lastChild = self + while hasattr(lastChild, 'contents') and lastChild.contents: + lastChild = lastChild.contents[-1] + return lastChild + + def insert(self, position, newChild): + if (isinstance(newChild, basestring) + or isinstance(newChild, unicode)) \ + and not isinstance(newChild, NavigableString): + newChild = NavigableString(newChild) + + position = min(position, len(self.contents)) + if hasattr(newChild, 'parent') and newChild.parent != None: + # We're 'inserting' an element that's already one + # of this object's children. + if newChild.parent == self: + index = self.find(newChild) + if index and index < position: + # Furthermore we're moving it further down the + # list of this object's children. That means that + # when we extract this element, our target index + # will jump down one. 
+ position = position - 1 + newChild.extract() + + newChild.parent = self + previousChild = None + if position == 0: + newChild.previousSibling = None + newChild.previous = self + else: + previousChild = self.contents[position-1] + newChild.previousSibling = previousChild + newChild.previousSibling.nextSibling = newChild + newChild.previous = previousChild._lastRecursiveChild() + if newChild.previous: + newChild.previous.next = newChild + + newChildsLastElement = newChild._lastRecursiveChild() + + if position >= len(self.contents): + newChild.nextSibling = None + + parent = self + parentsNextSibling = None + while not parentsNextSibling: + parentsNextSibling = parent.nextSibling + parent = parent.parent + if not parent: # This is the last element in the document. + break + if parentsNextSibling: + newChildsLastElement.next = parentsNextSibling + else: + newChildsLastElement.next = None + else: + nextChild = self.contents[position] + newChild.nextSibling = nextChild + if newChild.nextSibling: + newChild.nextSibling.previousSibling = newChild + newChildsLastElement.next = nextChild + + if newChildsLastElement.next: + newChildsLastElement.next.previous = newChildsLastElement + self.contents.insert(position, newChild) + + def append(self, tag): + """Appends the given tag to the contents of this tag.""" + self.insert(len(self.contents), tag) + + def findNext(self, name=None, attrs={}, text=None, **kwargs): + """Returns the first item that matches the given criteria and + appears after this Tag in the document.""" + return self._findOne(self.findAllNext, name, attrs, text, **kwargs) + + def findAllNext(self, name=None, attrs={}, text=None, limit=None, + **kwargs): + """Returns all items that match the given criteria and appear + after this Tag in the document.""" + return self._findAll(name, attrs, text, limit, self.nextGenerator, + **kwargs) + + def findNextSibling(self, name=None, attrs={}, text=None, **kwargs): + """Returns the closest sibling to this Tag that matches 
the + given criteria and appears after this Tag in the document.""" + return self._findOne(self.findNextSiblings, name, attrs, text, + **kwargs) + + def findNextSiblings(self, name=None, attrs={}, text=None, limit=None, + **kwargs): + """Returns the siblings of this Tag that match the given + criteria and appear after this Tag in the document.""" + return self._findAll(name, attrs, text, limit, + self.nextSiblingGenerator, **kwargs) + fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x + + def findPrevious(self, name=None, attrs={}, text=None, **kwargs): + """Returns the first item that matches the given criteria and + appears before this Tag in the document.""" + return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs) + + def findAllPrevious(self, name=None, attrs={}, text=None, limit=None, + **kwargs): + """Returns all items that match the given criteria and appear + before this Tag in the document.""" + return self._findAll(name, attrs, text, limit, self.previousGenerator, + **kwargs) + fetchPrevious = findAllPrevious # Compatibility with pre-3.x + + def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs): + """Returns the closest sibling to this Tag that matches the + given criteria and appears before this Tag in the document.""" + return self._findOne(self.findPreviousSiblings, name, attrs, text, + **kwargs) + + def findPreviousSiblings(self, name=None, attrs={}, text=None, + limit=None, **kwargs): + """Returns the siblings of this Tag that match the given + criteria and appear before this Tag in the document.""" + return self._findAll(name, attrs, text, limit, + self.previousSiblingGenerator, **kwargs) + fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x + + def findParent(self, name=None, attrs={}, **kwargs): + """Returns the closest parent of this Tag that matches the given + criteria.""" + # NOTE: We can't use _findOne because findParents takes a different + # set of arguments. 
+ r = None + l = self.findParents(name, attrs, 1) + if l: + r = l[0] + return r + + def findParents(self, name=None, attrs={}, limit=None, **kwargs): + """Returns the parents of this Tag that match the given + criteria.""" + + return self._findAll(name, attrs, None, limit, self.parentGenerator, + **kwargs) + fetchParents = findParents # Compatibility with pre-3.x + + #These methods do the real heavy lifting. + + def _findOne(self, method, name, attrs, text, **kwargs): + r = None + l = method(name, attrs, text, 1, **kwargs) + if l: + r = l[0] + return r + + def _findAll(self, name, attrs, text, limit, generator, **kwargs): + "Iterates over a generator looking for things that match." + + if isinstance(name, SoupStrainer): + strainer = name + else: + # Build a SoupStrainer + strainer = SoupStrainer(name, attrs, text, **kwargs) + results = ResultSet(strainer) + g = generator() + while True: + try: + i = g.next() + except StopIteration: + break + if i: + found = strainer.search(i) + if found: + results.append(found) + if limit and len(results) >= limit: + break + return results + + #These Generators can be used to navigate starting from both + #NavigableStrings and Tags. + def nextGenerator(self): + i = self + while i: + i = i.next + yield i + + def nextSiblingGenerator(self): + i = self + while i: + i = i.nextSibling + yield i + + def previousGenerator(self): + i = self + while i: + i = i.previous + yield i + + def previousSiblingGenerator(self): + i = self + while i: + i = i.previousSibling + yield i + + def parentGenerator(self): + i = self + while i: + i = i.parent + yield i + + # Utility methods + def substituteEncoding(self, str, encoding=None): + encoding = encoding or "utf-8" + return str.replace("%SOUP-ENCODING%", encoding) + + def toEncoding(self, s, encoding=None): + """Encodes an object to a string in some encoding, or to Unicode. 
+ .""" + if isinstance(s, unicode): + if encoding: + s = s.encode(encoding) + elif isinstance(s, str): + if encoding: + s = s.encode(encoding) + else: + s = unicode(s) + else: + if encoding: + s = self.toEncoding(str(s), encoding) + else: + s = unicode(s) + return s + +class NavigableString(unicode, PageElement): + + def __new__(cls, value): + """Create a new NavigableString. + + When unpickling a NavigableString, this method is called with + the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be + passed in to the superclass's __new__ or the superclass won't know + how to handle non-ASCII characters. + """ + if isinstance(value, unicode): + return unicode.__new__(cls, value) + return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) + + def __getnewargs__(self): + return (unicode(self),) + + def __getattr__(self, attr): + """text.string gives you text. This is for backwards + compatibility for Navigable*String, but for CData* it lets you + get the string without the CData wrapper.""" + if attr == 'string': + return self + else: + raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr) + + def encode(self, encoding=DEFAULT_OUTPUT_ENCODING): + return self.decode().encode(encoding) + + def decodeGivenEventualEncoding(self, eventualEncoding): + return self + +class CData(NavigableString): + + def decodeGivenEventualEncoding(self, eventualEncoding): + return u'' + +class ProcessingInstruction(NavigableString): + + def decodeGivenEventualEncoding(self, eventualEncoding): + output = self + if u'%SOUP-ENCODING%' in output: + output = self.substituteEncoding(output, eventualEncoding) + return u'' + +class Comment(NavigableString): + def decodeGivenEventualEncoding(self, eventualEncoding): + return u'' + +class Declaration(NavigableString): + def decodeGivenEventualEncoding(self, eventualEncoding): + return u'' + +class Tag(PageElement): + + """Represents a found HTML tag with its attributes and contents.""" + + def 
_invert(h): + "Cheap function to invert a hash." + i = {} + for k,v in h.items(): + i[v] = k + return i + + XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'", + "quot" : '"', + "amp" : "&", + "lt" : "<", + "gt" : ">" } + + XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS) + + def _convertEntities(self, match): + """Used in a call to re.sub to replace HTML, XML, and numeric + entities with the appropriate Unicode characters. If HTML + entities are being converted, any unrecognized entities are + escaped.""" + x = match.group(1) + if self.convertHTMLEntities and x in name2codepoint: + return unichr(name2codepoint[x]) + elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS: + if self.convertXMLEntities: + return self.XML_ENTITIES_TO_SPECIAL_CHARS[x] + else: + return u'&%s;' % x + elif len(x) > 0 and x[0] == '#': + # Handle numeric entities + if len(x) > 1 and x[1] == 'x': + return unichr(int(x[2:], 16)) + else: + return unichr(int(x[1:])) + + elif self.escapeUnrecognizedEntities: + return u'&%s;' % x + else: + return u'&%s;' % x + + def __init__(self, parser, name, attrs=None, parent=None, + previous=None): + "Basic constructor." + + # We don't actually store the parser object: that lets extracted + # chunks be garbage-collected + self.parserClass = parser.__class__ + self.isSelfClosing = parser.isSelfClosingTag(name) + self.name = name + if attrs == None: + attrs = [] + self.attrs = attrs + self.contents = [] + self.setup(parent, previous) + self.hidden = False + self.containsSubstitutions = False + self.convertHTMLEntities = parser.convertHTMLEntities + self.convertXMLEntities = parser.convertXMLEntities + self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities + + def convert(kval): + "Converts HTML, XML and numeric entities in the attribute value." 
+ k, val = kval + if val is None: + return kval + return (k, re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);", + self._convertEntities, val)) + self.attrs = map(convert, self.attrs) + + def get(self, key, default=None): + """Returns the value of the 'key' attribute for the tag, or + the value given for 'default' if it doesn't have that + attribute.""" + return self._getAttrMap().get(key, default) + + def has_key(self, key): + return self._getAttrMap().has_key(key) + + def __getitem__(self, key): + """tag[key] returns the value of the 'key' attribute for the tag, + and throws an exception if it's not there.""" + return self._getAttrMap()[key] + + def __iter__(self): + "Iterating over a tag iterates over its contents." + return iter(self.contents) + + def __len__(self): + "The length of a tag is the length of its list of contents." + return len(self.contents) + + def __contains__(self, x): + return x in self.contents + + def __nonzero__(self): + "A tag is non-None even if it has no contents." + return True + + def __setitem__(self, key, value): + """Setting tag[key] sets the value of the 'key' attribute for the + tag.""" + self._getAttrMap() + self.attrMap[key] = value + found = False + for i in range(0, len(self.attrs)): + if self.attrs[i][0] == key: + self.attrs[i] = (key, value) + found = True + if not found: + self.attrs.append((key, value)) + self._getAttrMap()[key] = value + + def __delitem__(self, key): + "Deleting tag[key] deletes all 'key' attributes for the tag." + for item in self.attrs: + if item[0] == key: + self.attrs.remove(item) + #We don't break because bad HTML can define the same + #attribute multiple times. + self._getAttrMap() + if self.attrMap.has_key(key): + del self.attrMap[key] + + def __call__(self, *args, **kwargs): + """Calling a tag like a function is the same as calling its + findAll() method. Eg. 
tag('a') returns a list of all the A tags + found within this tag.""" + return apply(self.findAll, args, kwargs) + + def __getattr__(self, tag): + #print "Getattr %s.%s" % (self.__class__, tag) + if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3: + return self.find(tag[:-3]) + elif tag.find('__') != 0: + return self.find(tag) + raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag) + + def __eq__(self, other): + """Returns true iff this tag has the same name, the same attributes, + and the same contents (recursively) as the given tag. + + NOTE: right now this will return false if two tags have the + same attributes in a different order. Should this be fixed?""" + if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other): + return False + for i in range(0, len(self.contents)): + if self.contents[i] != other.contents[i]: + return False + return True + + def __ne__(self, other): + """Returns true iff this tag is not identical to the other tag, + as defined in __eq__.""" + return not self == other + + def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING): + """Renders this tag as a string.""" + return self.decode(eventualEncoding=encoding) + + BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|" + + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" + + ")") + + def _sub_entity(self, x): + """Used with a regular expression to substitute the + appropriate XML entity for an XML special character.""" + return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";" + + def __unicode__(self): + return self.decode() + + def __str__(self): + return self.encode() + + def encode(self, encoding=DEFAULT_OUTPUT_ENCODING, + prettyPrint=False, indentLevel=0): + return self.decode(prettyPrint, indentLevel, encoding).encode(encoding) + + def decode(self, prettyPrint=False, indentLevel=0, + eventualEncoding=DEFAULT_OUTPUT_ENCODING): + """Returns a string or 
Unicode representation of this tag and + its contents. To get Unicode, pass None for encoding.""" + + attrs = [] + if self.attrs: + for key, val in self.attrs: + fmt = '%s="%s"' + if isString(val): + if (self.containsSubstitutions + and eventualEncoding is not None + and '%SOUP-ENCODING%' in val): + val = self.substituteEncoding(val, eventualEncoding) + + # The attribute value either: + # + # * Contains no embedded double quotes or single quotes. + # No problem: we enclose it in double quotes. + # * Contains embedded single quotes. No problem: + # double quotes work here too. + # * Contains embedded double quotes. No problem: + # we enclose it in single quotes. + # * Embeds both single _and_ double quotes. This + # can't happen naturally, but it can happen if + # you modify an attribute value after parsing + # the document. Now we have a bit of a + # problem. We solve it by enclosing the + # attribute in single quotes, and escaping any + # embedded single quotes to XML entities. + if '"' in val: + fmt = "%s='%s'" + if "'" in val: + # TODO: replace with apos when + # appropriate. + val = val.replace("'", "&squot;") + + # Now we're okay w/r/t quotes. But the attribute + # value might also contain angle brackets, or + # ampersands that aren't part of entities. We need + # to escape those to XML entities too. + val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val) + if val is None: + # Handle boolean attributes. 
+ decoded = key + else: + decoded = fmt % (key, val) + attrs.append(decoded) + close = '' + closeTag = '' + if self.isSelfClosing: + close = ' /' + else: + closeTag = '' % self.name + + indentTag, indentContents = 0, 0 + if prettyPrint: + indentTag = indentLevel + space = (' ' * (indentTag-1)) + indentContents = indentTag + 1 + contents = self.decodeContents(prettyPrint, indentContents, + eventualEncoding) + if self.hidden: + s = contents + else: + s = [] + attributeString = '' + if attrs: + attributeString = ' ' + ' '.join(attrs) + if prettyPrint: + s.append(space) + s.append('<%s%s%s>' % (self.name, attributeString, close)) + if prettyPrint: + s.append("\n") + s.append(contents) + if prettyPrint and contents and contents[-1] != "\n": + s.append("\n") + if prettyPrint and closeTag: + s.append(space) + s.append(closeTag) + if prettyPrint and closeTag and self.nextSibling: + s.append("\n") + s = ''.join(s) + return s + + def decompose(self): + """Recursively destroys the contents of this tree.""" + contents = [i for i in self.contents] + for i in contents: + if isinstance(i, Tag): + i.decompose() + else: + i.extract() + self.extract() + + def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING): + return self.encode(encoding, True) + + def encodeContents(self, encoding=DEFAULT_OUTPUT_ENCODING, + prettyPrint=False, indentLevel=0): + return self.decodeContents(prettyPrint, indentLevel).encode(encoding) + + def decodeContents(self, prettyPrint=False, indentLevel=0, + eventualEncoding=DEFAULT_OUTPUT_ENCODING): + """Renders the contents of this tag as a string in the given + encoding. 
If encoding is None, returns a Unicode string..""" + s=[] + for c in self: + text = None + if isinstance(c, NavigableString): + text = c.decodeGivenEventualEncoding(eventualEncoding) + elif isinstance(c, Tag): + s.append(c.decode(prettyPrint, indentLevel, eventualEncoding)) + if text and prettyPrint: + text = text.strip() + if text: + if prettyPrint: + s.append(" " * (indentLevel-1)) + s.append(text) + if prettyPrint: + s.append("\n") + return ''.join(s) + + #Soup methods + + def find(self, name=None, attrs={}, recursive=True, text=None, + **kwargs): + """Return only the first child of this Tag matching the given + criteria.""" + r = None + l = self.findAll(name, attrs, recursive, text, 1, **kwargs) + if l: + r = l[0] + return r + findChild = find + + def findAll(self, name=None, attrs={}, recursive=True, text=None, + limit=None, **kwargs): + """Extracts a list of Tag objects that match the given + criteria. You can specify the name of the Tag and any + attributes you want the Tag to have. + + The value of a key-value pair in the 'attrs' map can be a + string, a list of strings, a regular expression object, or a + callable that takes a string and returns whether or not the + string matches for some custom definition of 'matches'. The + same is true of the tag name.""" + generator = self.recursiveChildGenerator + if not recursive: + generator = self.childGenerator + return self._findAll(name, attrs, text, limit, generator, **kwargs) + findChildren = findAll + + # Pre-3.x compatibility methods. Will go away in 4.0. + first = find + fetch = findAll + + def fetchText(self, text=None, recursive=True, limit=None): + return self.findAll(text=text, recursive=recursive, limit=limit) + + def firstText(self, text=None, recursive=True): + return self.find(text=text, recursive=recursive) + + # 3.x compatibility methods. Will go away in 4.0. 
+ def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING, + prettyPrint=False, indentLevel=0): + if encoding is None: + return self.decodeContents(prettyPrint, indentLevel, encoding) + else: + return self.encodeContents(encoding, prettyPrint, indentLevel) + + + #Private methods + + def _getAttrMap(self): + """Initializes a map representation of this tag's attributes, + if not already initialized.""" + if not getattr(self, 'attrMap'): + self.attrMap = {} + for (key, value) in self.attrs: + self.attrMap[key] = value + return self.attrMap + + #Generator methods + def recursiveChildGenerator(self): + if not len(self.contents): + raise StopIteration + stopNode = self._lastRecursiveChild().next + current = self.contents[0] + while current is not stopNode: + yield current + current = current.next + + def childGenerator(self): + if not len(self.contents): + raise StopIteration + current = self.contents[0] + while current: + yield current + current = current.nextSibling + raise StopIteration + +# Next, a couple classes to represent queries and their results. 
+class SoupStrainer: + """Encapsulates a number of ways of matching a markup element (tag or + text).""" + + def __init__(self, name=None, attrs={}, text=None, **kwargs): + self.name = name + if isString(attrs): + kwargs['class'] = attrs + attrs = None + if kwargs: + if attrs: + attrs = attrs.copy() + attrs.update(kwargs) + else: + attrs = kwargs + self.attrs = attrs + self.text = text + + def __str__(self): + if self.text: + return self.text + else: + return "%s|%s" % (self.name, self.attrs) + + def searchTag(self, markupName=None, markupAttrs={}): + found = None + markup = None + if isinstance(markupName, Tag): + markup = markupName + markupAttrs = markup + callFunctionWithTagData = callable(self.name) \ + and not isinstance(markupName, Tag) + + if (not self.name) \ + or callFunctionWithTagData \ + or (markup and self._matches(markup, self.name)) \ + or (not markup and self._matches(markupName, self.name)): + if callFunctionWithTagData: + match = self.name(markupName, markupAttrs) + else: + match = True + markupAttrMap = None + for attr, matchAgainst in self.attrs.items(): + if not markupAttrMap: + if hasattr(markupAttrs, 'get'): + markupAttrMap = markupAttrs + else: + markupAttrMap = {} + for k,v in markupAttrs: + markupAttrMap[k] = v + attrValue = markupAttrMap.get(attr) + if not self._matches(attrValue, matchAgainst): + match = False + break + if match: + if markup: + found = markup + else: + found = markupName + return found + + def search(self, markup): + #print 'looking for %s in %s' % (self, markup) + found = None + # If given a list of items, scan it for a text element that + # matches. + if isList(markup) and not isinstance(markup, Tag): + for element in markup: + if isinstance(element, NavigableString) \ + and self.search(element): + found = element + break + # If it's a Tag, make sure its name or attributes match. + # Don't bother with Tags if we're searching for text. 
+ elif isinstance(markup, Tag): + if not self.text: + found = self.searchTag(markup) + # If it's text, make sure the text matches. + elif isinstance(markup, NavigableString) or \ + isString(markup): + if self._matches(markup, self.text): + found = markup + else: + raise Exception, "I don't know how to match against a %s" \ + % markup.__class__ + return found + + def _matches(self, markup, matchAgainst): + #print "Matching %s against %s" % (markup, matchAgainst) + result = False + if matchAgainst == True and type(matchAgainst) == types.BooleanType: + result = markup != None + elif callable(matchAgainst): + result = matchAgainst(markup) + else: + #Custom match methods take the tag as an argument, but all + #other ways of matching match the tag name as a string. + if isinstance(markup, Tag): + markup = markup.name + if markup is not None and not isString(markup): + markup = unicode(markup) + #Now we know that chunk is either a string, or None. + if hasattr(matchAgainst, 'match'): + # It's a regexp object. + result = markup and matchAgainst.search(markup) + elif (isList(matchAgainst) + and (markup is not None or not isString(matchAgainst))): + result = markup in matchAgainst + elif hasattr(matchAgainst, 'items'): + result = markup.has_key(matchAgainst) + elif matchAgainst and isString(markup): + if isinstance(markup, unicode): + matchAgainst = unicode(matchAgainst) + else: + matchAgainst = str(matchAgainst) + + if not result: + result = matchAgainst == markup + return result + +class ResultSet(list): + """A ResultSet is just a list that keeps track of the SoupStrainer + that created it.""" + def __init__(self, source): + list.__init__([]) + self.source = source + +# Now, some helper functions. 
+ +def isList(l): + """Convenience method that works with all 2.x versions of Python + to determine whether or not something is listlike.""" + return ((hasattr(l, '__iter__') and not isString(l)) + or (type(l) in (types.ListType, types.TupleType))) + +def isString(s): + """Convenience method that works with all 2.x versions of Python + to determine whether or not something is stringlike.""" + try: + return isinstance(s, unicode) or isinstance(s, basestring) + except NameError: + return isinstance(s, str) + +def buildTagMap(default, *args): + """Turns a list of maps, lists, or scalars into a single map. + Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and + NESTING_RESET_TAGS maps out of lists and partial maps.""" + built = {} + for portion in args: + if hasattr(portion, 'items'): + #It's a map. Merge it. + for k,v in portion.items(): + built[k] = v + elif isList(portion) and not isString(portion): + #It's a list. Map each item to the default. + for k in portion: + built[k] = default + else: + #It's a scalar. Map it to the default. + built[portion] = default + return built + +# Now, the parser classes. + +class HTMLParserBuilder(HTMLParser): + + def __init__(self, soup): + HTMLParser.__init__(self) + self.soup = soup + + # We inherit feed() and reset(). 
+ + def handle_starttag(self, name, attrs): + if name == 'meta': + self.soup.extractCharsetFromMeta(attrs) + else: + self.soup.unknown_starttag(name, attrs) + + def handle_endtag(self, name): + self.soup.unknown_endtag(name) + + def handle_data(self, content): + self.soup.handle_data(content) + + def _toStringSubclass(self, text, subclass): + """Adds a certain piece of text to the tree as a NavigableString + subclass.""" + self.soup.endData() + self.handle_data(text) + self.soup.endData(subclass) + + def handle_pi(self, text): + """Handle a processing instruction as a ProcessingInstruction + object, possibly one with a %SOUP-ENCODING% slot into which an + encoding will be plugged later.""" + if text[:3] == "xml": + text = u"xml version='1.0' encoding='%SOUP-ENCODING%'" + self._toStringSubclass(text, ProcessingInstruction) + + def handle_comment(self, text): + "Handle comments as Comment objects." + self._toStringSubclass(text, Comment) + + def handle_charref(self, ref): + "Handle character references as data." + if self.soup.convertEntities: + data = unichr(int(ref)) + else: + data = '&#%s;' % ref + self.handle_data(data) + + def handle_entityref(self, ref): + """Handle entity references as data, possibly converting known + HTML and/or XML entity references to the corresponding Unicode + characters.""" + data = None + if self.soup.convertHTMLEntities: + try: + data = unichr(name2codepoint[ref]) + except KeyError: + pass + + if not data and self.soup.convertXMLEntities: + data = self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref) + + if not data and self.soup.convertHTMLEntities and \ + not self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref): + # TODO: We've got a problem here. We're told this is + # an entity reference, but it's not an XML entity + # reference or an HTML entity reference. Nonetheless, + # the logical thing to do is to pass it through as an + # unrecognized entity reference. 
+ # + # Except: when the input is "&carol;" this function + # will be called with input "carol". When the input is + # "AT&T", this function will be called with input + # "T". We have no way of knowing whether a semicolon + # was present originally, so we don't know whether + # this is an unknown entity or just a misplaced + # ampersand. + # + # The more common case is a misplaced ampersand, so I + # escape the ampersand and omit the trailing semicolon. + data = "&%s" % ref + if not data: + # This case is different from the one above, because we + # haven't already gone through a supposedly comprehensive + # mapping of entities to Unicode characters. We might not + # have gone through any mapping at all. So the chances are + # very high that this is a real entity, and not a + # misplaced ampersand. + data = "&%s;" % ref + self.handle_data(data) + + def handle_decl(self, data): + "Handle DOCTYPEs and the like as Declaration objects." + self._toStringSubclass(data, Declaration) + + def parse_declaration(self, i): + """Treat a bogus SGML declaration as raw data. Treat a CDATA + declaration as a CData object.""" + j = None + if self.rawdata[i:i+9] == '', i) + if k == -1: + k = len(self.rawdata) + data = self.rawdata[i+9:k] + j = k+3 + self._toStringSubclass(data, CData) + else: + try: + j = HTMLParser.parse_declaration(self, i) + except HTMLParseError: + toHandle = self.rawdata[i:] + self.handle_data(toHandle) + j = i + len(toHandle) + return j + + +class BeautifulStoneSoup(Tag): + + """This class contains the basic parser and search code. It defines + a parser that knows nothing about tag behavior except for the + following: + + You can't close a tag without closing all the tags it encloses. + That is, "" actually means + "". + + [Another possible explanation is "", but since + this class defines no SELF_CLOSING_TAGS, it will never use that + explanation.] 
+ + This class is useful for parsing XML or made-up markup languages, + or when BeautifulSoup makes an assumption counter to what you were + expecting.""" + + SELF_CLOSING_TAGS = {} + NESTABLE_TAGS = {} + RESET_NESTING_TAGS = {} + QUOTE_TAGS = {} + PRESERVE_WHITESPACE_TAGS = [] + + MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'), + lambda x: x.group(1) + ' />'), + (re.compile(']*)>'), + lambda x: '') + ] + + ROOT_TAG_NAME = u'[document]' + + HTML_ENTITIES = "html" + XML_ENTITIES = "xml" + XHTML_ENTITIES = "xhtml" + # TODO: This only exists for backwards-compatibility + ALL_ENTITIES = XHTML_ENTITIES + + # Used when determining whether a text node is all whitespace and + # can be replaced with a single space. A text node that contains + # fancy Unicode spaces (usually non-breaking) should be left + # alone. + STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, } + + def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None, + markupMassage=True, smartQuotesTo=XML_ENTITIES, + convertEntities=None, selfClosingTags=None, isHTML=False, + builder=HTMLParserBuilder): + """The Soup object is initialized as the 'root tag', and the + provided markup (which can be a string or a file-like object) + is fed into the underlying parser. + + HTMLParser will process most bad HTML, and the BeautifulSoup + class has some tricks for dealing with some HTML that kills + HTMLParser, but Beautiful Soup can nonetheless choke or lose data + if your data uses self-closing tags or declarations + incorrectly. + + By default, Beautiful Soup uses regexes to sanitize input, + avoiding the vast majority of these problems. If the problems + don't apply to you, pass in False for markupMassage, and + you'll get better performance. + + The default parser massage techniques fix the two most common + instances of invalid HTML that choke HTMLParser: + +
(No space between name of closing tag and tag close) + (Extraneous whitespace in declaration) + + You can pass in a custom list of (RE object, replace method) + tuples to get Beautiful Soup to scrub your input the way you + want.""" + + self.parseOnlyThese = parseOnlyThese + self.fromEncoding = fromEncoding + self.smartQuotesTo = smartQuotesTo + self.convertEntities = convertEntities + # Set the rules for how we'll deal with the entities we + # encounter + if self.convertEntities: + # It doesn't make sense to convert encoded characters to + # entities even while you're converting entities to Unicode. + # Just convert it all to Unicode. + self.smartQuotesTo = None + if convertEntities == self.HTML_ENTITIES: + self.convertXMLEntities = False + self.convertHTMLEntities = True + self.escapeUnrecognizedEntities = True + elif convertEntities == self.XHTML_ENTITIES: + self.convertXMLEntities = True + self.convertHTMLEntities = True + self.escapeUnrecognizedEntities = False + elif convertEntities == self.XML_ENTITIES: + self.convertXMLEntities = True + self.convertHTMLEntities = False + self.escapeUnrecognizedEntities = False + else: + self.convertXMLEntities = False + self.convertHTMLEntities = False + self.escapeUnrecognizedEntities = False + + self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags) + self.builder = builder(self) + self.reset() + + if hasattr(markup, 'read'): # It's a file-type object. + markup = markup.read() + self.markup = markup + self.markupMassage = markupMassage + try: + self._feed(isHTML=isHTML) + except StopParsing: + pass + self.markup = None # The markup can now be GCed. + self.builder = None # So can the builder. + + def _feed(self, inDocumentEncoding=None, isHTML=False): + # Convert the document to Unicode. 
+ markup = self.markup + if isinstance(markup, unicode): + if not hasattr(self, 'originalEncoding'): + self.originalEncoding = None + else: + dammit = UnicodeDammit\ + (markup, [self.fromEncoding, inDocumentEncoding], + smartQuotesTo=self.smartQuotesTo, isHTML=isHTML) + markup = dammit.unicode + self.originalEncoding = dammit.originalEncoding + self.declaredHTMLEncoding = dammit.declaredHTMLEncoding + if markup: + if self.markupMassage: + if not isList(self.markupMassage): + self.markupMassage = self.MARKUP_MASSAGE + for fix, m in self.markupMassage: + markup = fix.sub(m, markup) + # TODO: We get rid of markupMassage so that the + # soup object can be deepcopied later on. Some + # Python installations can't copy regexes. If anyone + # was relying on the existence of markupMassage, this + # might cause problems. + del(self.markupMassage) + self.builder.reset() + + self.builder.feed(markup) + # Close out any unfinished strings and close all the open tags. + self.endData() + while self.currentTag.name != self.ROOT_TAG_NAME: + self.popTag() + + def isSelfClosingTag(self, name): + """Returns true iff the given string is the name of a + self-closing tag according to this parser.""" + return self.SELF_CLOSING_TAGS.has_key(name) \ + or self.instanceSelfClosingTags.has_key(name) + + def reset(self): + Tag.__init__(self, self, self.ROOT_TAG_NAME) + self.hidden = 1 + self.builder.reset() + self.currentData = [] + self.currentTag = None + self.tagStack = [] + self.quoteStack = [] + self.pushTag(self) + + def popTag(self): + tag = self.tagStack.pop() + # Tags with just one string-owning child get the child as a + # 'string' property, so that soup.tag.string is shorthand for + # soup.tag.contents[0] + if len(self.currentTag.contents) == 1 and \ + isinstance(self.currentTag.contents[0], NavigableString): + self.currentTag.string = self.currentTag.contents[0] + + #print "Pop", tag.name + if self.tagStack: + self.currentTag = self.tagStack[-1] + return self.currentTag + + def 
pushTag(self, tag): + #print "Push", tag.name + if self.currentTag: + self.currentTag.contents.append(tag) + self.tagStack.append(tag) + self.currentTag = self.tagStack[-1] + + def endData(self, containerClass=NavigableString): + if self.currentData: + currentData = u''.join(self.currentData) + if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and + not set([tag.name for tag in self.tagStack]).intersection( + self.PRESERVE_WHITESPACE_TAGS)): + if '\n' in currentData: + currentData = '\n' + else: + currentData = ' ' + self.currentData = [] + if self.parseOnlyThese and len(self.tagStack) <= 1 and \ + (not self.parseOnlyThese.text or \ + not self.parseOnlyThese.search(currentData)): + return + o = containerClass(currentData) + o.setup(self.currentTag, self.previous) + if self.previous: + self.previous.next = o + self.previous = o + self.currentTag.contents.append(o) + + + def _popToTag(self, name, inclusivePop=True): + """Pops the tag stack up to and including the most recent + instance of the given tag. If inclusivePop is false, pops the tag + stack up to but *not* including the most recent instqance of + the given tag.""" + #print "Popping to %s" % name + if name == self.ROOT_TAG_NAME: + return + + numPops = 0 + mostRecentTag = None + for i in range(len(self.tagStack)-1, 0, -1): + if name == self.tagStack[i].name: + numPops = len(self.tagStack)-i + break + if not inclusivePop: + numPops = numPops - 1 + + for i in range(0, numPops): + mostRecentTag = self.popTag() + return mostRecentTag + + def _smartPop(self, name): + + """We need to pop up to the previous tag of this type, unless + one of this tag's nesting reset triggers comes between this + tag and the previous tag of this type, OR unless this tag is a + generic nesting trigger and another generic nesting trigger + comes between this tag and the previous tag of this type. + + Examples: +

FooBar *

* should pop to 'p', not 'b'. +

FooBar *

* should pop to 'table', not 'p'. +

Foo

Bar *

* should pop to 'tr', not 'p'. + +

    • *
    • * should pop to 'ul', not the first 'li'. +
  • ** should pop to 'table', not the first 'tr' + tag should + implicitly close the previous tag within the same
    ** should pop to 'tr', not the first 'td' + """ + + nestingResetTriggers = self.NESTABLE_TAGS.get(name) + isNestable = nestingResetTriggers != None + isResetNesting = self.RESET_NESTING_TAGS.has_key(name) + popTo = None + inclusive = True + for i in range(len(self.tagStack)-1, 0, -1): + p = self.tagStack[i] + if (not p or p.name == name) and not isNestable: + #Non-nestable tags get popped to the top or to their + #last occurance. + popTo = name + break + if (nestingResetTriggers != None + and p.name in nestingResetTriggers) \ + or (nestingResetTriggers == None and isResetNesting + and self.RESET_NESTING_TAGS.has_key(p.name)): + + #If we encounter one of the nesting reset triggers + #peculiar to this tag, or we encounter another tag + #that causes nesting to reset, pop up to but not + #including that tag. + popTo = p.name + inclusive = False + break + p = p.parent + if popTo: + self._popToTag(popTo, inclusive) + + def unknown_starttag(self, name, attrs, selfClosing=0): + #print "Start tag %s: %s" % (name, attrs) + if self.quoteStack: + #This is not a real tag. + #print "<%s> is not real!" % name + attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs)) + self.handle_data('<%s%s>' % (name, attrs)) + return + self.endData() + + if not self.isSelfClosingTag(name) and not selfClosing: + self._smartPop(name) + + if self.parseOnlyThese and len(self.tagStack) <= 1 \ + and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)): + return + + tag = Tag(self, name, attrs, self.currentTag, self.previous) + if self.previous: + self.previous.next = tag + self.previous = tag + self.pushTag(tag) + if selfClosing or self.isSelfClosingTag(name): + self.popTag() + if name in self.QUOTE_TAGS: + #print "Beginning quote (%s)" % name + self.quoteStack.append(name) + self.literal = 1 + return tag + + def unknown_endtag(self, name): + #print "End tag %s" % name + if self.quoteStack and self.quoteStack[-1] != name: + #This is not a real end tag. 
+ #print " is not real!" % name + self.handle_data('' % name) + return + self.endData() + self._popToTag(name) + if self.quoteStack and self.quoteStack[-1] == name: + self.quoteStack.pop() + self.literal = (len(self.quoteStack) > 0) + + def handle_data(self, data): + self.currentData.append(data) + + def extractCharsetFromMeta(self, attrs): + self.unknown_starttag('meta', attrs) + + +class BeautifulSoup(BeautifulStoneSoup): + + """This parser knows the following facts about HTML: + + * Some tags have no closing tag and should be interpreted as being + closed as soon as they are encountered. + + * The text inside some tags (ie. 'script') may contain tags which + are not really part of the document and which should be parsed + as text, not tags. If you want to parse the text as tags, you can + always fetch it and parse it explicitly. + + * Tag nesting rules: + + Most tags can't be nested at all. For instance, the occurance of + a

    tag should implicitly close the previous

    tag. + +

    Para1

    Para2 + should be transformed into: +

    Para1

    Para2 + + Some tags can be nested arbitrarily. For instance, the occurance + of a

    tag should _not_ implicitly close the previous +
    tag. + + Alice said:
    Bob said:
    Blah + should NOT be transformed into: + Alice said:
    Bob said:
    Blah + + Some tags can be nested, but the nesting is reset by the + interposition of other tags. For instance, a
    , + but not close a tag in another table. + +
    BlahBlah + should be transformed into: +
    BlahBlah + but, + Blah
    Blah + should NOT be transformed into + Blah
    Blah + + Differing assumptions about tag nesting rules are a major source + of problems with the BeautifulSoup class. If BeautifulSoup is not + treating as nestable a tag your page author treats as nestable, + try ICantBelieveItsBeautifulSoup, MinimalSoup, or + BeautifulStoneSoup before writing your own subclass.""" + + def __init__(self, *args, **kwargs): + if not kwargs.has_key('smartQuotesTo'): + kwargs['smartQuotesTo'] = self.HTML_ENTITIES + kwargs['isHTML'] = True + BeautifulStoneSoup.__init__(self, *args, **kwargs) + + SELF_CLOSING_TAGS = buildTagMap(None, + ['br' , 'hr', 'input', 'img', 'meta', + 'spacer', 'link', 'frame', 'base']) + + PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea']) + + QUOTE_TAGS = {'script' : None, 'textarea' : None} + + #According to the HTML standard, each of these inline tags can + #contain another tag of the same type. Furthermore, it's common + #to actually use these tags this way. + NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup', + 'center'] + + #According to the HTML standard, these block tags can contain + #another tag of the same type. Furthermore, it's common + #to actually use these tags this way. + NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del'] + + #Lists can contain other lists, but there are restrictions. + NESTABLE_LIST_TAGS = { 'ol' : [], + 'ul' : [], + 'li' : ['ul', 'ol'], + 'dl' : [], + 'dd' : ['dl'], + 'dt' : ['dl'] } + + #Tables can contain other tables, but there are restrictions. + NESTABLE_TABLE_TAGS = {'table' : [], + 'tr' : ['table', 'tbody', 'tfoot', 'thead'], + 'td' : ['tr'], + 'th' : ['tr'], + 'thead' : ['table'], + 'tbody' : ['table'], + 'tfoot' : ['table'], + } + + NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre'] + + #If one of these tags is encountered, all tags up to the next tag of + #this type are popped. 
+ RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript', + NON_NESTABLE_BLOCK_TAGS, + NESTABLE_LIST_TAGS, + NESTABLE_TABLE_TAGS) + + NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS, + NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS) + + # Used to detect the charset in a META tag; see start_meta + CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M) + + def extractCharsetFromMeta(self, attrs): + """Beautiful Soup can detect a charset included in a META tag, + try to convert the document to that charset, and re-parse the + document from the beginning.""" + httpEquiv = None + contentType = None + contentTypeIndex = None + tagNeedsEncodingSubstitution = False + + for i in range(0, len(attrs)): + key, value = attrs[i] + key = key.lower() + if key == 'http-equiv': + httpEquiv = value + elif key == 'content': + contentType = value + contentTypeIndex = i + + if httpEquiv and contentType: # It's an interesting meta tag. + match = self.CHARSET_RE.search(contentType) + if match: + if (self.declaredHTMLEncoding is not None or + self.originalEncoding == self.fromEncoding): + # An HTML encoding was sniffed while converting + # the document to Unicode, or an HTML encoding was + # sniffed during a previous pass through the + # document, or an encoding was specified + # explicitly and it worked. Rewrite the meta tag. + def rewrite(match): + return match.group(1) + "%SOUP-ENCODING%" + newAttr = self.CHARSET_RE.sub(rewrite, contentType) + attrs[contentTypeIndex] = (attrs[contentTypeIndex][0], + newAttr) + tagNeedsEncodingSubstitution = True + else: + # This is our first pass through the document. + # Go through it again with the encoding information. 
+ newCharset = match.group(3) + if newCharset and newCharset != self.originalEncoding: + self.declaredHTMLEncoding = newCharset + self._feed(self.declaredHTMLEncoding) + raise StopParsing + pass + tag = self.unknown_starttag("meta", attrs) + if tag and tagNeedsEncodingSubstitution: + tag.containsSubstitutions = True + + +class StopParsing(Exception): + pass + +class ICantBelieveItsBeautifulSoup(BeautifulSoup): + + """The BeautifulSoup class is oriented towards skipping over + common HTML errors like unclosed tags. However, sometimes it makes + errors of its own. For instance, consider this fragment: + + FooBar + + This is perfectly valid (if bizarre) HTML. However, the + BeautifulSoup class will implicitly close the first b tag when it + encounters the second 'b'. It will think the author wrote + "FooBar", and didn't close the first 'b' tag, because + there's no real-world reason to bold something that's already + bold. When it encounters '' it will close two more 'b' + tags, for a grand total of three tags closed instead of two. This + can throw off the rest of your document structure. The same is + true of a number of other tags, listed below. + + It's much more common for someone to forget to close a 'b' tag + than to actually use nested 'b' tags, and the BeautifulSoup class + handles the common case. 
This class handles the not-co-common + case: where you can't believe someone wrote what they did, but + it's valid HTML and BeautifulSoup screwed up by assuming it + wouldn't be.""" + + I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \ + ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong', + 'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b', + 'big'] + + I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript'] + + NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS, + I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS, + I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS) + +class MinimalSoup(BeautifulSoup): + """The MinimalSoup class is for parsing HTML that contains + pathologically bad markup. It makes no assumptions about tag + nesting, but it does know which tags are self-closing, that + Foo" + soup = BeautifulSoup(text) + self.assertEqual(soup.script.contents[0], "if (iThis is an example of an HTML tag<&<&") + +class OperatorOverload(SoupTest): + "Our operators do it all! Call now!" + + def testTagNameAsFind(self): + "Tests that referencing a tag name as a member delegates to find()." + soup = BeautifulSoup('foobarRed herring') + self.assertEqual(soup.b.i, soup.find('b').find('i')) + self.assertEqual(soup.b.i.string, 'bar') + self.assertEqual(soup.b['id'], '1') + self.assertEqual(soup.b.contents[0], 'foo') + self.assert_(not soup.a) + + #Test the .fooTag variant of .foo. + self.assertEqual(soup.bTag.iTag.string, 'bar') + self.assertEqual(soup.b.iTag.string, 'bar') + self.assertEqual(soup.find('b').find('i'), soup.bTag.iTag) + +class NestableEgg(SoupTest): + """Here we test tag nesting. TEST THE NEST, DUDE! X-TREME!""" + + def testParaInsideBlockquote(self): + soup = BeautifulSoup('

    Foo

    Bar') + self.assertEqual(soup.blockquote.p.b.string, 'Foo') + self.assertEqual(soup.blockquote.b.string, 'Foo') + self.assertEqual(soup.find('p', recursive=False).string, 'Bar') + + def testNestedTables(self): + text = """

    Here's another table: +
    Juicy text
    """ + soup = BeautifulSoup(text) + self.assertEquals(soup.table.table.td.string, 'Juicy text') + self.assertEquals(len(soup.findAll('table')), 2) + self.assertEquals(len(soup.table.findAll('table')), 1) + self.assertEquals(soup.find('table', {'id' : 2}).parent.parent.parent.name, + 'table') + + text = "
    Foo
    " + soup = BeautifulSoup(text) + self.assertEquals(soup.table.tr.td.div.table.contents[0], "Foo") + + text = """FooBar + Baz
    """ + soup = BeautifulSoup(text) + self.assertEquals(soup.table.thead.tr.contents[0], "Foo") + + def testBadNestedTables(self): + soup = BeautifulSoup("
    ") + self.assertEquals(soup.table.tr.table.tr['id'], 'nested') + +class CleanupOnAisleFour(SoupTest): + """Here we test cleanup of text that breaks HTMLParser or is just + obnoxious.""" + + def testSelfClosingtag(self): + self.assertEqual(BeautifulSoup("Foo
    Bar").find('br').decode(), + '
    ') + + self.assertSoupEquals('

    test1
    test2

    ', + '

    test1
    test2

    ') + + text = '

    test1test2' + soup = BeautifulStoneSoup(text) + self.assertEqual(soup.decode(), + '

    test1test2

    ') + + soup = BeautifulStoneSoup(text, selfClosingTags='selfclosing') + self.assertEqual(soup.decode(), + '

    test1test2

    ') + + def testSelfClosingTagOrNot(self): + text = "http://foo.com/" + self.assertEqual(BeautifulStoneSoup(text).decode(), text) + self.assertEqual(BeautifulSoup(text).decode(), + 'http://foo.com/') + + def testBooleanAttributes(self): + text = "" + self.assertSoupEquals(text, text) + + def testCData(self): + xml = "foobar" + self.assertSoupEquals(xml, xml) + r = re.compile("foo.*bar") + soup = BeautifulSoup(xml) + self.assertEquals(soup.find(text=r).string, "foobar") + self.assertEquals(soup.find(text=r).__class__, CData) + + def testComments(self): + xml = "foobaz" + self.assertSoupEquals(xml) + r = re.compile("foo.*bar") + soup = BeautifulSoup(xml) + self.assertEquals(soup.find(text=r).string, "foobar") + self.assertEquals(soup.find(text="foobar").__class__, Comment) + + def testDeclaration(self): + xml = "foobaz" + self.assertSoupEquals(xml) + r = re.compile(".*foo.*bar") + soup = BeautifulSoup(xml) + text = "DOCTYPE foobar" + self.assertEquals(soup.find(text=r).string, text) + self.assertEquals(soup.find(text=text).__class__, Declaration) + + namespaced_doctype = ('' + 'foo') + soup = BeautifulSoup(namespaced_doctype) + self.assertEquals(soup.contents[0], + 'DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd"') + self.assertEquals(soup.html.contents[0], 'foo') + + def testEntityConversions(self): + text = "<<sacré bleu!>>" + soup = BeautifulStoneSoup(text) + self.assertSoupEquals(text) + + xmlEnt = BeautifulStoneSoup.XML_ENTITIES + htmlEnt = BeautifulStoneSoup.HTML_ENTITIES + xhtmlEnt = BeautifulStoneSoup.XHTML_ENTITIES + + soup = BeautifulStoneSoup(text, convertEntities=xmlEnt) + self.assertEquals(soup.decode(), "<>") + + soup = BeautifulStoneSoup(text, convertEntities=xmlEnt) + self.assertEquals(soup.decode(), "<>") + + soup = BeautifulStoneSoup(text, convertEntities=htmlEnt) + self.assertEquals(soup.decode(), u"<>") + + # Make sure the "XML", "HTML", and "XHTML" settings work. 
+ text = "<™'" + soup = BeautifulStoneSoup(text, convertEntities=xmlEnt) + self.assertEquals(soup.decode(), u"<™'") + + soup = BeautifulStoneSoup(text, convertEntities=htmlEnt) + self.assertEquals(soup.decode(), u"<\u2122'") + + soup = BeautifulStoneSoup(text, convertEntities=xhtmlEnt) + self.assertEquals(soup.decode(), u"<\u2122'") + + def testNonBreakingSpaces(self): + soup = BeautifulSoup("  ", + convertEntities=BeautifulStoneSoup.HTML_ENTITIES) + self.assertEquals(soup.decode(), u"\xa0\xa0") + + def testWhitespaceInDeclaration(self): + self.assertSoupEquals('', '') + + def testJunkInDeclaration(self): + self.assertSoupEquals('a', 'a') + + def testIncompleteDeclaration(self): + self.assertSoupEquals('ac') + + def testEntityReplacement(self): + self.assertSoupEquals('hello there') + + def testEntitiesInAttributeValues(self): + self.assertSoupEquals('', '', + encoding='utf-8') + self.assertSoupEquals('', '', + encoding='utf-8') + + soup = BeautifulSoup('', + convertEntities=BeautifulStoneSoup.HTML_ENTITIES) + self.assertEquals(soup.decode(), u'') + + uri = "http://crummy.com?sacré&bleu" + link = '' % uri + + soup = BeautifulSoup(link, convertEntities=BeautifulSoup.HTML_ENTITIES) + self.assertEquals(soup.decode(), + link.replace("é", u"\xe9")) + + uri = "http://crummy.com?sacré&bleu" + link = '' % uri + soup = BeautifulSoup(link, convertEntities=BeautifulSoup.HTML_ENTITIES) + self.assertEquals(soup.a['href'], + uri.replace("é", u"\xe9")) + + def testNakedAmpersands(self): + html = {'convertEntities':BeautifulStoneSoup.HTML_ENTITIES} + soup = BeautifulStoneSoup("AT&T ", **html) + self.assertEquals(soup.decode(), 'AT&T ') + + nakedAmpersandInASentence = "AT&T was Ma Bell" + soup = BeautifulStoneSoup(nakedAmpersandInASentence,**html) + self.assertEquals(soup.decode(), \ + nakedAmpersandInASentence.replace('&','&')) + + invalidURL = 'foo' + validURL = invalidURL.replace('&','&') + soup = BeautifulStoneSoup(invalidURL) + self.assertEquals(soup.decode(), validURL) + + 
soup = BeautifulStoneSoup(validURL) + self.assertEquals(soup.decode(), validURL) + + +class EncodeRed(SoupTest): + """Tests encoding conversion, Unicode conversion, and Microsoft + smart quote fixes.""" + + def testUnicodeDammitStandalone(self): + markup = "\x92" + dammit = UnicodeDammit(markup) + self.assertEquals(dammit.unicode, "") + + hebrew = "\xed\xe5\xec\xf9" + dammit = UnicodeDammit(hebrew, ["iso-8859-8"]) + self.assertEquals(dammit.unicode, u'\u05dd\u05d5\u05dc\u05e9') + self.assertEquals(dammit.originalEncoding, 'iso-8859-8') + + def testGarbageInGarbageOut(self): + ascii = "a" + asciiSoup = BeautifulStoneSoup(ascii) + self.assertEquals(ascii, asciiSoup.decode()) + + unicodeData = u"\u00FC" + utf8 = unicodeData.encode("utf-8") + self.assertEquals(utf8, '\xc3\xbc') + + unicodeSoup = BeautifulStoneSoup(unicodeData) + self.assertEquals(unicodeData, unicodeSoup.decode()) + self.assertEquals(unicodeSoup.foo.string, u'\u00FC') + + utf8Soup = BeautifulStoneSoup(utf8, fromEncoding='utf-8') + self.assertEquals(utf8, utf8Soup.encode('utf-8')) + self.assertEquals(utf8Soup.originalEncoding, "utf-8") + + utf8Soup = BeautifulStoneSoup(unicodeData) + self.assertEquals(utf8, utf8Soup.encode('utf-8')) + self.assertEquals(utf8Soup.originalEncoding, None) + + + def testHandleInvalidCodec(self): + for bad_encoding in ['.utf8', '...', 'utF---16.!']: + soup = BeautifulSoup(u"RäksmörgÃ¥s".encode("utf-8"), + fromEncoding=bad_encoding) + self.assertEquals(soup.originalEncoding, 'utf-8') + + def testUnicodeSearch(self): + html = u'

    Räksmörgås

    ' + soup = BeautifulSoup(html) + self.assertEqual(soup.find(text=u'Räksmörgås'),u'Räksmörgås') + + def testRewrittenXMLHeader(self): + euc_jp = '\n\n\xa4\xb3\xa4\xec\xa4\xcfEUC-JP\xa4\xc7\xa5\xb3\xa1\xbc\xa5\xc7\xa5\xa3\xa5\xf3\xa5\xb0\xa4\xb5\xa4\xec\xa4\xbf\xc6\xfc\xcb\xdc\xb8\xec\xa4\xce\xa5\xd5\xa5\xa1\xa5\xa4\xa5\xeb\xa4\xc7\xa4\xb9\xa1\xa3\n\n' + utf8 = "\n\n\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafEUC-JP\xe3\x81\xa7\xe3\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n\n" + soup = BeautifulStoneSoup(euc_jp) + if soup.originalEncoding != "euc-jp": + raise Exception("Test failed when parsing euc-jp document. " + "If you're running Python >=2.4, or you have " + "cjkcodecs installed, this is a real problem. " + "Otherwise, ignore it.") + + self.assertEquals(soup.originalEncoding, "euc-jp") + self.assertEquals(soup.renderContents('utf-8'), utf8) + + old_text = "\x92" + new_text = "" + self.assertSoupEquals(old_text, new_text) + + def testRewrittenMetaTag(self): + no_shift_jis_html = '''\n
    \n\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n
    ''' + soup = BeautifulSoup(no_shift_jis_html) + + # Beautiful Soup used to try to rewrite the meta tag even if the + # meta tag got filtered out by the strainer. This test makes + # sure that doesn't happen. + strainer = SoupStrainer('pre') + soup = BeautifulSoup(no_shift_jis_html, parseOnlyThese=strainer) + self.assertEquals(soup.contents[0].name, 'pre') + + meta_tag = ('') + shift_jis_html = ( + '\n%s\n' + '' + '
    \n'
    +            '\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
    +            '\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
    +            '\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n'
    +            '
    ') % meta_tag + soup = BeautifulSoup(shift_jis_html) + if soup.originalEncoding != "shift-jis": + raise Exception("Test failed when parsing shift-jis document " + "with meta tag '%s'." + "If you're running Python >=2.4, or you have " + "cjkcodecs installed, this is a real problem. " + "Otherwise, ignore it." % meta_tag) + self.assertEquals(soup.originalEncoding, "shift-jis") + + content_type_tag = soup.meta['content'] + self.assertEquals(content_type_tag[content_type_tag.find('charset='):], + 'charset=%SOUP-ENCODING%') + content_type = str(soup.meta) + index = content_type.find('charset=') + self.assertEqual(content_type[index:index+len('charset=utf8')+1], + 'charset=utf-8') + content_type = soup.meta.encode('shift-jis') + index = content_type.find('charset=') + self.assertEqual(content_type[index:index+len('charset=shift-jis')], + 'charset=shift-jis'.encode()) + + self.assertEquals(soup.encode('utf-8'), ( + '\n' + '\n' + '' + '
    \n'
    +                '\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafShift-JIS\xe3\x81\xa7\xe3'
    +                '\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3'
    +                '\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6'
    +                '\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3'
    +                '\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n'
    +                '
    ')) + self.assertEquals(soup.encode("shift-jis"), + shift_jis_html.replace('x-sjis'.encode(), + 'shift-jis'.encode())) + + isolatin = """Sacr\xe9 bleu!""" + soup = BeautifulSoup(isolatin) + + utf8 = isolatin.replace("ISO-Latin-1".encode(), "utf-8".encode()) + utf8 = utf8.replace("\xe9", "\xc3\xa9") + self.assertSoupEquals(soup.encode("utf-8"), utf8, encoding='utf-8') + + def testHebrew(self): + iso_8859_8= '\nHebrew (ISO 8859-8) in Visual Directionality\n\n\n\n\n\n

    Hebrew (ISO 8859-8) in Visual Directionality

    \n\xed\xe5\xec\xf9\n\n' + utf8 = '\nHebrew (ISO 8859-8) in Visual Directionality\n\n\n

    Hebrew (ISO 8859-8) in Visual Directionality

    \n\xd7\x9d\xd7\x95\xd7\x9c\xd7\xa9\n\n' + soup = BeautifulStoneSoup(iso_8859_8, fromEncoding="iso-8859-8") + self.assertEquals(soup.encode('utf-8'), utf8) + + def testSmartQuotesNotSoSmartAnymore(self): + self.assertSoupEquals("\x91Foo\x92 ", + '‘Foo’ ') + + def testDontConvertSmartQuotesWhenAlsoConvertingEntities(self): + smartQuotes = "Il a dit, \x8BSacré bleu!\x9b" + soup = BeautifulSoup(smartQuotes) + self.assertEquals(soup.decode(), + 'Il a dit, ‹Sacré bleu!›') + soup = BeautifulSoup(smartQuotes, convertEntities="html") + self.assertEquals(soup.encode('utf-8'), + 'Il a dit, \xe2\x80\xb9Sacr\xc3\xa9 bleu!\xe2\x80\xba') + + def testDontSeeSmartQuotesWhereThereAreNone(self): + utf_8 = "\343\202\261\343\203\274\343\202\277\343\202\244 Watch" + self.assertSoupEquals(utf_8, encoding='utf-8') + + +class Whitewash(SoupTest): + """Test whitespace preservation.""" + + def testPreservedWhitespace(self): + self.assertSoupEquals("
       
    ") + self.assertSoupEquals("
     woo  
    ") + + def testCollapsedWhitespace(self): + self.assertSoupEquals("

    ", "

    ") + + +if __name__ == '__main__': + unittest.main() diff -r 3156760b4d26 -r 4cc66ab098e8 app/htmlsanitizer/HtmlSanitizer.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/app/htmlsanitizer/HtmlSanitizer.py Tue May 26 02:37:39 2009 +0200 @@ -0,0 +1,575 @@ +# -*- coding: UTF-8 -*- +""" +some input filters, for regularising the html fragments from screen scraping and +browser-based editors into some semblance of sanity + +TODO: turn the messy setting[method_name]=True filter syntax into a list of cleaning methods to invoke, so that they can be invoked in a specific order and multiple times. + +AUTHORS: +Dan MacKinlay - https://launchpad.net/~dan-possumpalace +Collin Grady - http://launchpad.net/~collin-collingrady +Andreas Gustafsson - https://bugs.launchpad.net/~gson +HÃ¥kan W - https://launchpad.net/~hwaara-gmail +""" + +import BeautifulSoup +import re +import sys + +# Python 2.4 compatibility +try: any +except NameError: + def any(iterable): + for element in iterable: + if element: + return True + return False + +""" +html5lib compatibility. Basically, we need to know that this still works whether html5lib +is imported or not. Should run complete suites of tests for both possible configs - +or test in virtual environments, but for now a basic sanity check will do. +>>> if html5: +>>> c=Cleaner(html5=False) +>>> c(u'

    foo

    ) +u'

    foo

    ' +""" +try: + import html5lib + from html5lib import sanitizer, treebuilders + parser = html5lib.HTMLParser( + tree=treebuilders.getTreeBuilder("beautifulsoup"), + tokenizer=sanitizer.HTMLSanitizer + ) + html5 = True +except ImportError: + html5 = False + +ANTI_JS_RE=re.compile('j\s*a\s*v\s*a\s*s\s*c\s*r\s*i\s*p\s*t\s*:', re.IGNORECASE) +#These tags and attrs are sufficently liberal to let microformats through... +#it ruthlessly culls all the rdf, dublin core metadata and so on. +valid_tags = dict.fromkeys('p i em strong b u a h1 h2 h3 pre abbr br img dd dt ol ul li span sub sup ins del blockquote table tr td th address cite'.split()) #div? +valid_attrs = dict.fromkeys('href src rel title'.split()) +valid_schemes = dict.fromkeys('http https'.split()) +elem_map = {'b' : 'strong', 'i': 'em'} +attrs_considered_links = dict.fromkeys("src href".split()) #should include +#courtesy http://developer.mozilla.org/en/docs/HTML:Block-level_elements +block_elements = dict.fromkeys(["p", "h1","h2", "h3", "h4", "h5", "h6", "ol", "ul", "pre", "address", "blockquote", "dl", "div", "fieldset", "form", "hr", "noscript", "table"]) + +#convenient default filter lists. 
+paranoid_filters = ["strip_comments", "strip_tags", "strip_attrs", + "strip_schemes", "rename_tags", "wrap_string", "strip_empty_tags", "strip_empty_tags", ] +complete_filters = ["strip_comments", "rename_tags", "strip_tags", "strip_attrs", + "strip_cdata", "strip_schemes", "wrap_string", "strip_empty_tags", "rebase_links", "reparse"] + +#set some conservative default string processings +default_settings = { + "filters" : paranoid_filters, + "block_elements" : block_elements, #xml or None for a more liberal version + "convert_entities" : "html", #xml or None for a more liberal version + "valid_tags" : valid_tags, + "valid_attrs" : valid_attrs, + "valid_schemes" : valid_schemes, + "attrs_considered_links" : attrs_considered_links, + "elem_map" : elem_map, + "wrapping_element" : "p", + "auto_clean" : False, + "original_url" : "", + "new_url" : "", + "html5" : html5 +} +#processes I'd like but haven't implemented +#"encode_xml_specials", "ensure complete xhtml doc", "ensure_xhtml_fragment_only" +# and some handling of permitted namespaces for tags. for RDF, say. maybe. + +XML_ENTITIES = { u"'" : u"'", + u'"' : u""", + u"&" : u"&", + u"<" : u"<", + u">" : u">" + } +LINE_EXTRACTION_RE = re.compile(".+", re.MULTILINE) +BR_EXTRACTION_RE = re.compile("", re.MULTILINE) + +class Stop: + """ + handy class that we use as a stop input for our state machine in lieu of falling + off the end of lists + """ + pass + + +class Cleaner(object): + r""" + powerful and slow arbitrary HTML sanitisation. can deal (i hope) with most XSS + vectors and layout-breaking badness. + Probably overkill for content from trusted sources; defaults are accordingly + set to be paranoid. + >>> bad_html = '

    content>> good_html = u'

    content

    ' + >>> c = Cleaner() + >>> c.string = bad_html + >>> c.clean() + >>> c.string == good_html + True + + Also supports shorthand syntax: + >>> c = Cleaner() + >>> c(bad_html) == c(good_html) + True + """ + + def __init__(self, string_or_soup="", *args, **kwargs): + self.settings=default_settings.copy() + self.settings.update(kwargs) + if args : + self.settings['filters'] = args + super(Cleaner, self).__init__(string_or_soup, *args, **kwargs) + self.string = string_or_soup + + def __call__(self, string = None, **kwargs): + """ + convenience method allowing one-step calling of an instance and returning + a cleaned string. + + TODO: make this method preserve internal state- perhaps by creating a new + instance. + + >>> s = 'input string' + >>> c1 = Cleaner(s, auto_clean=True) + >>> c2 = Cleaner("") + >>> c1.string == c2(s) + True + + """ + self.settings.update(kwargs) + if not string == None : + self.string = string + self.clean() + return self.string + + def _set_contents(self, string_or_soup): + if isinstance(string_or_soup, BeautifulSoup.BeautifulSoup) : + self._set_soup(string_or_soup) + else : + self._set_string(string_or_soup) + + def _set_string(self, html_fragment_string): + if self.settings['html5']: + s = parser.parse(html_fragment_string).body + else: + s = BeautifulSoup.BeautifulSoup( + html_fragment_string, + convertEntities=self.settings['convert_entities']) + self._set_soup(s) + + def _set_soup(self, soup): + """ + Does all the work of set_string, but bypasses a potential autoclean to avoid + loops upon internal string setting ops. 
+ """ + self._soup = BeautifulSoup.BeautifulSoup( + '' + ) + self.root=self._soup.contents[0] + + if len(soup.contents) : + backwards_soup = [i for i in soup.contents] + backwards_soup.reverse() + else : + backwards_soup = [] + for i in backwards_soup : + i.extract() + self.root.insert(0, i) + + def set_string(self, string) : + ur""" + sets the string to process and does the necessary input encoding too + really intended to be invoked as a property. + note the godawful rootrootroot element which we need because the + BeautifulSoup object has all the same methods as a Tag, but + behaves differently, silently failing on some inserts and appends + + >>> c = Cleaner(convert_entities="html") + >>> c.string = 'é' + >>> c.string + u'\xe9' + >>> c = Cleaner(convert_entities="xml") + >>> c.string = u'é' + >>> c.string + u'é' + """ + self._set_string(string) + if len(string) and self.settings['auto_clean'] : self.clean() + + def get_string(self): + return unicode(self.root.renderContents()) + + string = property(get_string, set_string) + + def clean(self): + """ + invoke all cleaning processes stipulated in the settings + """ + for method in self.settings['filters'] : + try : + getattr(self, method)() + except NotImplementedError : + sys.stderr.write('Warning, called unimplemented method %s' % method + '\n') + + def strip_comments(self): + r""" + XHTML comments are used as an XSS attack vector. they must die. + + >>> c = Cleaner("", "strip_comments") + >>> c('

    text More text

    ') + u'

    text More text

    ' + """ + for comment in self.root.findAll( + text = lambda text: isinstance(text, BeautifulSoup.Comment)): + comment.extract() + + def strip_cdata(self): + for cdata in self.root.findAll( + text = lambda text: isinstance(text, BeautifulSoup.CData)): + cdata.extract() + + def strip_tags(self): + r""" + ill-considered tags break our layout. they must die. + >>> c = Cleaner("", "strip_tags", auto_clean=True) + >>> c.string = '
    A B C
    ' + >>> c.string + u'A B C' + >>> c.string = '
    A
    B C
    ' + >>> c.string + u'A B C' + >>> c.string = '
    A
    B C
    ' + >>> c.string + u'A
    B C' + >>> c.string = '

    A

    B C

    ' + >>> c.string + u'

    A B C

    ' + >>> c.string = 'A
    B
    C
    D
    E
    F
    G' + >>> c.string + u'ABCDEFG' + >>> c.string = '
    B
    C
    D
    E
    F
    ' + >>> c.string + u'BCDEF' + """ + # Beautiful Soup doesn't support dynamic .findAll results when the tree is + # modified in place. + # going backwards doesn't seem to help. + # so find one at a time + while True : + next_bad_tag = self.root.find( + lambda tag : not tag.name in (self.settings['valid_tags']) + ) + if next_bad_tag : + self.disgorge_elem(next_bad_tag) + else: + break + + def strip_attrs(self): + """ + preserve only those attributes we need in the soup + >>> c = Cleaner("", "strip_attrs") + >>> c('
    A B C
    ') + u'
    A B C
    ' + """ + for tag in self.root.findAll(True): + tag.attrs = [(attr, val) for attr, val in tag.attrs + if attr in self.settings['valid_attrs']] + + def _all_links(self): + """ + finds all tags with link attributes sequentially. safe against modification + of said attributes in-place. + """ + start = self.root + while True: + tag = start.findNext( + lambda tag : any( + [(tag.get(i) for i in self.settings['attrs_considered_links'])] + )) + if tag: + start = tag + yield tag + else : + break + + def strip_schemes(self): + """ + >>> c = Cleaner("", "strip_schemes") + >>> c('') + u'' + >>> c('foo') + u'foo' + """ + for tag in self._all_links() : + for key in self.settings['attrs_considered_links'] : + scheme_bits = tag.get(key, u"").split(u':',1) + if len(scheme_bits) == 1 : + pass #relative link + else: + if not scheme_bits[0] in self.settings['valid_schemes'] : + del(tag[key]) + + def br_to_p(self): + """ + >>> c = Cleaner("", "br_to_p") + >>> c('

    A
    B

    ') + u'

    A

    B

    ' + >>> c('A
    B') + u'

    A

    B

    ' + """ + block_elems = self.settings['block_elements'] + block_elems['br'] = None + block_elems['p'] = None + + while True : + next_br = self.root.find('br') + if not next_br: break + parent = next_br.parent + self.wrap_string('p', start_at=parent, block_elems = block_elems) + while True: + useless_br=parent.find('br', recursive=False) + if not useless_br: break + useless_br.extract() + if parent.name == 'p': + self.disgorge_elem(parent) + + def rename_tags(self): + """ + >>> c = Cleaner("", "rename_tags", elem_map={'i': 'em'}) + >>> c('AB') + u'AB' + """ + for tag in self.root.findAll(self.settings['elem_map']) : + tag.name = self.settings['elem_map'][tag.name] + + def wrap_string(self, wrapping_element = None, start_at=None, block_elems=None): + """ + takes an html fragment, which may or may not have a single containing element, + and guarantees what the tag name of the topmost elements are. + TODO: is there some simpler way than a state machine to do this simple thing? + >>> c = Cleaner("", "wrap_string") + >>> c('A B CD') + u'

    A B CD

    ' + >>> c('A

    B C

    D') + u'

    A

    B C

    D

    ' + """ + if not start_at : start_at = self.root + if not block_elems : block_elems = self.settings['block_elements'] + e = (wrapping_element or self.settings['wrapping_element']) + paragraph_list = [] + children = [elem for elem in start_at.contents] + children.append(Stop()) + + last_state = 'block' + paragraph = BeautifulSoup.Tag(self._soup, e) + + for node in children : + if isinstance(node, Stop) : + state = 'end' + elif hasattr(node, 'name') and node.name in block_elems: + state = 'block' + else: + state = 'inline' + + if last_state == 'block' and state == 'inline': + #collate inline elements + paragraph = BeautifulSoup.Tag(self._soup, e) + + if state == 'inline' : + paragraph.append(node) + + if ((state <> 'inline') and last_state == 'inline') : + paragraph_list.append(paragraph) + + if state == 'block' : + paragraph_list.append(node) + + last_state = state + + #can't use append since it doesn't work on empty elements... + paragraph_list.reverse() + for paragraph in paragraph_list: + start_at.insert(0, paragraph) + + def strip_empty_tags(self): + """ + strip out all empty tags + TODO: depth-first search + >>> c = Cleaner("", "strip_empty_tags") + >>> c('

    A

    B

    ') + u'

    A

    B

    ' + >>> c('

    ') + u'

    ' + """ + tag = self.root + while True: + next_tag = tag.findNext(True) + if not next_tag: break + if next_tag.contents or next_tag.attrs: + tag = next_tag + continue + next_tag.extract() + + def rebase_links(self, original_url="", new_url ="") : + if not original_url : original_url = self.settings.get('original_url', '') + if not new_url : new_url = self.settings.get('new_url', '') + raise NotImplementedError + + # Because of its internal character set handling, + # the following will not work in Beautiful soup and is hopefully redundant. + # def encode_xml_specials(self, original_url="", new_url ="") : + # """ + # BeautifulSoup will let some dangerous xml entities hang around + # in the navigable strings. destroy all monsters. + # >>> c = Cleaner(auto_clean=True, encode_xml_specials=True) + # >>> c('<<<<<') + # u'<<<<' + # """ + # for string in self.root.findAll(text=True) : + # sys.stderr.write("root" +"\n") + # sys.stderr.write(str(self.root) +"\n") + # sys.stderr.write("parent" +"\n") + # sys.stderr.write(str(string.parent) +"\n") + # new_string = unicode(string) + # sys.stderr.write(string +"\n") + # for special_char in XML_ENTITIES.keys() : + # sys.stderr.write(special_char +"\n") + # string.replaceWith( + # new_string.replace(special_char, XML_ENTITIES[special_char]) + # ) + + + def disgorge_elem(self, elem): + """ + remove the given element from the soup and replaces it with its own contents + actually tricky, since you can't replace an element with an list of elements + using replaceWith + >>> disgorgeable_string = 'A B C' + >>> c = Cleaner() + >>> c.string = disgorgeable_string + >>> elem = c._soup.find('em') + >>> c.disgorge_elem(elem) + >>> c.string + u'A B C' + >>> c.string = disgorgeable_string + >>> elem = c._soup.find('body') + >>> c.disgorge_elem(elem) + >>> c.string + u'A B C' + >>> c.string = '
    A
    B C
    ' + >>> elem = c._soup.find(id="inner") + >>> c.disgorge_elem(elem) + >>> c.string + u'
    A B C
    ' + """ + if elem == self.root : + raise AttributeError, "Can't disgorge root" + + # With in-place modification, BeautifulSoup occasionally can return + # elements that think they are orphans + # this lib is full of workarounds, but it's worth checking + parent = elem.parent + if parent == None: + raise AttributeError, "AAAAAAAAGH! NO PARENTS! DEATH!" + + i = None + for i in range(len(parent.contents)) : + if parent.contents[i] == elem : + index = i + break + + elem.extract() + + #the proceeding method breaks horribly, sporadically. + # for i in range(len(elem.contents)) : + # elem.contents[i].extract() + # parent.contents.insert(index+i, elem.contents[i]) + # return + self._safe_inject(parent, index, elem.contents) + + def _safe_inject(self, dest, dest_index, node_list): + #BeautifulSoup result sets look like lists but don't behave right + # i.e. empty ones are still True, + if not len(node_list) : return + node_list = [i for i in node_list] + node_list.reverse() + for i in node_list : + dest.insert(dest_index, i) + + +class Htmlator(object) : + """ + converts a string into a series of html paragraphs + """ + settings = { + "encode_xml_specials" : True, + "is_plaintext" : True, + "convert_newlines" : False, + "make_links" : True, + "auto_convert" : False, + "valid_schemes" : valid_schemes, + } + def __init__(self, string = "", **kwargs): + self.settings.update(kwargs) + super(Htmlator, self).__init__(string, **kwargs) + self.string = string + + def _set_string(self, string): + self.string = string + if self.settings['auto_convert'] : self.convert() + + def _get_string(self): + return unicode(self._soup) + + string = property(_get_string, _set_string) + + def __call__(self, string): + """ + convenience method supporting one-step calling of an instance + as a string cleaning function + """ + self.string = string + self.convert() + return self.string + + def convert(self): + for method in ["encode_xml_specials", "convert_newlines", + "make_links"] : + if 
self.settings(method) : + getattr(self, method)() + + def encode_xml_specials(self) : + for char in XML_ENTITIES.keys() : + self.string.replace(char, XML_ENTITIES[char]) + + def make_links(self): + raise NotImplementedError + + def convert_newlines(self) : + self.string = ''.join([ + '

    ' + line + '

    ' for line in LINE_EXTRACTION_RE.findall(self.string) + ]) + +def _test(): + import doctest + doctest.testmod() + +if __name__ == "__main__": + _test() + + +# def cast_input_to_soup(fn): +# """ +# Decorate function to handle strings as BeautifulSoups transparently +# """ +# def stringy_version(input, *args, **kwargs) : +# if not isinstance(input,BeautifulSoup) : +# input=BeautifulSoup(input) +# return fn(input, *args, **kwargs) +# return stringy_version diff -r 3156760b4d26 -r 4cc66ab098e8 app/htmlsanitizer/LICENSE-BeautifulSoup --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/app/htmlsanitizer/LICENSE-BeautifulSoup Tue May 26 02:37:39 2009 +0200 @@ -0,0 +1,32 @@ +Copyright (c) 2004-2009, Leonard Richardson + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of the the Beautiful Soup Consortium and All + Night Kosher Bakery nor the names of its contributors may be + used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT. diff -r 3156760b4d26 -r 4cc66ab098e8 app/htmlsanitizer/LICENSE-HtmlSanitizer --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/app/htmlsanitizer/LICENSE-HtmlSanitizer Tue May 26 02:37:39 2009 +0200 @@ -0,0 +1,23 @@ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the distribution. + + 3. The names of the authors may not be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES, +INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND +FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JCRAFT, +INC. 
OR ANY CONTRIBUTORS TO THIS SOFTWARE BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff -r 3156760b4d26 -r 4cc66ab098e8 app/htmlsanitizer/__init__.py diff -r 3156760b4d26 -r 4cc66ab098e8 app/main.py --- a/app/main.py Mon May 25 23:42:15 2009 +0200 +++ b/app/main.py Tue May 26 02:37:39 2009 +0200 @@ -29,42 +29,7 @@ from google.appengine.ext.webapp import util - -# Remove the standard version of Django. -for k in [k for k in sys.modules if k.startswith('django')]: - del sys.modules[k] - -# Force sys.path to have our own directory first, in case we want to import -# from it. This lets us replace the built-in Django -sys.path.insert(0, os.path.abspath(os.path.dirname(__file__))) - -sys.path.insert(0, os.path.abspath('django.zip')) - -ultimate_sys_path = None - -# Force Django to reload its settings. -from django.conf import settings -settings._target = None - -# Must set this env var before importing any part of Django -os.environ['DJANGO_SETTINGS_MODULE'] = 'settings' - -import django.core.handlers.wsgi -import django.core.signals -import django.db - -# Log errors. -def log_exception(*args, **kwds): - """Function used for logging exceptions. - """ - logging.exception('Exception in request:') - -# Log all exceptions detected by Django. -django.core.signals.got_request_exception.connect(log_exception) - -# Unregister the rollback event handler. -django.core.signals.got_request_exception.disconnect( - django.db._rollback_on_exception) +import gae_django def profile_main_as_html(): @@ -117,15 +82,17 @@ def real_main(): """Main program without profiling. 
""" - global ultimate_sys_path - if ultimate_sys_path is None: - ultimate_sys_path = list(sys.path) - else: - sys.path[:] = ultimate_sys_path + import django.core.handlers.wsgi # Create a Django application for WSGI. application = django.core.handlers.wsgi.WSGIHandler() + from soc.modules import callback + from soc.modules import core + + callback.registerCore(core.Core()) + callback.getCore().registerModuleCallbacks() + # Run the WSGI CGI handler with that application. util.run_wsgi_app(application) diff -r 3156760b4d26 -r 4cc66ab098e8 app/settings.py --- a/app/settings.py Mon May 25 23:42:15 2009 +0200 +++ b/app/settings.py Tue May 26 02:37:39 2009 +0200 @@ -100,6 +100,7 @@ os.path.join(ROOT_PATH, 'ghop', 'templates'), os.path.join(ROOT_PATH, 'gsoc', 'templates'), os.path.join(ROOT_PATH, 'soc', 'templates'), + os.path.join(ROOT_PATH, 'shell', 'templates'), ) INSTALLED_APPS = ( @@ -109,3 +110,6 @@ # 'django.contrib.sessions', # 'django.contrib.sites', ) + +MODULE_FMT = 'soc.modules.%s' +MODULES = [] diff -r 3156760b4d26 -r 4cc66ab098e8 app/shell/README --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/app/shell/README Tue May 26 02:37:39 2009 +0200 @@ -0,0 +1,17 @@ +An interactive, stateful AJAX shell that runs Python code on the server. + +Part of http://code.google.com/p/google-app-engine-samples/. + +May be run as a standalone app or in an existing app as an admin-only handler. +Can be used for system administration tasks, as an interactive way to try out +APIs, or as a debugging aid during development. + +The logging, os, sys, db, and users modules are imported automatically. + +Interpreter state is stored in the datastore so that variables, function +definitions, and other values in the global and local namespaces can be used +across commands. + +To use the shell in your app, copy shell.py, static/*, and templates/* into +your app's source directory. Then, copy the URL handlers from app.yaml into +your app.yaml. 
diff -r 3156760b4d26 -r 4cc66ab098e8 app/shell/__init__.py diff -r 3156760b4d26 -r 4cc66ab098e8 app/shell/shell.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/app/shell/shell.py Tue May 26 02:37:39 2009 +0200 @@ -0,0 +1,317 @@ +#!/usr/bin/python +# +# Copyright 2007 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +An interactive, stateful AJAX shell that runs Python code on the server. + +Part of http://code.google.com/p/google-app-engine-samples/. + +May be run as a standalone app or in an existing app as an admin-only handler. +Can be used for system administration tasks, as an interactive way to try out +APIs, or as a debugging aid during development. + +The logging, os, sys, db, and users modules are imported automatically. + +Interpreter state is stored in the datastore so that variables, function +definitions, and other values in the global and local namespaces can be used +across commands. + +To use the shell in your app, copy shell.py, static/*, and templates/* into +your app's source directory. Then, copy the URL handlers from app.yaml into +your app.yaml. + +TODO: unit tests! 
+""" + +import logging +import new +import os +import pickle +import sys +import traceback +import types +import wsgiref.handlers + +from django.template import loader +from google.appengine.api import users +from google.appengine.ext import db +from google.appengine.ext import webapp +from google.appengine.ext.webapp import template + +import django.template +import gae_django + + +# Set to True if stack traces should be shown in the browser, etc. +_DEBUG = True + +# The entity kind for shell sessions. Feel free to rename to suit your app. +_SESSION_KIND = '_Shell_Session' + +# Types that can't be pickled. +UNPICKLABLE_TYPES = ( + types.ModuleType, + types.TypeType, + types.ClassType, + types.FunctionType, + ) + +# Unpicklable statements to seed new sessions with. +INITIAL_UNPICKLABLES = [ + 'import logging', + 'import os', + 'import sys', + 'from google.appengine.ext import db', + 'from google.appengine.api import users', + ] + + +class ShellSession(db.Model): + """A shell session. Stores the session's globals. + + Each session globals is stored in one of two places: + + If the global is picklable, it's stored in the parallel globals and + global_names list properties. (They're parallel lists to work around the + unfortunate fact that the datastore can't store dictionaries natively.) + + If the global is not picklable (e.g. modules, classes, and functions), or if + it was created by the same statement that created an unpicklable global, + it's not stored directly. Instead, the statement is stored in the + unpicklables list property. On each request, before executing the current + statement, the unpicklable statements are evaluated to recreate the + unpicklable globals. + + The unpicklable_names property stores all of the names of globals that were + added by unpicklable statements. When we pickle and store the globals after + executing a statement, we skip the ones in unpicklable_names. + + Using Text instead of string is an optimization. 
We don't query on any of + these properties, so they don't need to be indexed. + """ + global_names = db.ListProperty(db.Text) + globals = db.ListProperty(db.Blob) + unpicklable_names = db.ListProperty(db.Text) + unpicklables = db.ListProperty(db.Text) + + def set_global(self, name, value): + """Adds a global, or updates it if it already exists. + + Also removes the global from the list of unpicklable names. + + Args: + name: the name of the global to remove + value: any picklable value + """ + blob = db.Blob(pickle.dumps(value)) + + if name in self.global_names: + index = self.global_names.index(name) + self.globals[index] = blob + else: + self.global_names.append(db.Text(name)) + self.globals.append(blob) + + self.remove_unpicklable_name(name) + + def remove_global(self, name): + """Removes a global, if it exists. + + Args: + name: string, the name of the global to remove + """ + if name in self.global_names: + index = self.global_names.index(name) + del self.global_names[index] + del self.globals[index] + + def globals_dict(self): + """Returns a dictionary view of the globals. + """ + return dict((name, pickle.loads(val)) + for name, val in zip(self.global_names, self.globals)) + + def add_unpicklable(self, statement, names): + """Adds a statement and list of names to the unpicklables. + + Also removes the names from the globals. + + Args: + statement: string, the statement that created new unpicklable global(s). + names: list of strings; the names of the globals created by the statement. + """ + self.unpicklables.append(db.Text(statement)) + + for name in names: + self.remove_global(name) + if name not in self.unpicklable_names: + self.unpicklable_names.append(db.Text(name)) + + def remove_unpicklable_name(self, name): + """Removes a name from the list of unpicklable names, if it exists. 
+ + Args: + name: string, the name of the unpicklable global to remove + """ + if name in self.unpicklable_names: + self.unpicklable_names.remove(name) + + +class FrontPageHandler(webapp.RequestHandler): + """Creates a new session and renders the shell.html template. + """ + + def get(self): + # set up the session. TODO: garbage collect old shell sessions + session_key = self.request.get('session') + if session_key: + session = ShellSession.get(session_key) + else: + # create a new session + session = ShellSession() + session.unpicklables = [db.Text(line) for line in INITIAL_UNPICKLABLES] + session_key = session.put() + + template_file = os.path.join(os.path.dirname(__file__), 'templates', + 'shell.html') + session_url = '/?session=%s' % session_key + vars = { 'server_software': os.environ['SERVER_SOFTWARE'], + 'python_version': sys.version, + 'session': str(session_key), + 'user': users.get_current_user(), + 'login_url': users.create_login_url(session_url), + 'logout_url': users.create_logout_url(session_url), + } + + rendered = loader.render_to_string('shell.html', dictionary=vars) + # rendered = webapp.template.render(template_file, vars, debug=_DEBUG) + self.response.out.write(rendered) + + +class StatementHandler(webapp.RequestHandler): + """Evaluates a python statement in a given session and returns the result. + """ + + def get(self): + self.response.headers['Content-Type'] = 'text/plain' + + # extract the statement to be run + statement = self.request.get('statement') + if not statement: + return + + # the python compiler doesn't like network line endings + statement = statement.replace('\r\n', '\n') + + # add a couple newlines at the end of the statement. this makes + # single-line expressions such as 'class Foo: pass' evaluate happily. 
+ statement += '\n\n' + + # log and compile the statement up front + try: + logging.info('Compiling and evaluating:\n%s' % statement) + compiled = compile(statement, '', 'single') + except: + self.response.out.write(traceback.format_exc()) + return + + # create a dedicated module to be used as this statement's __main__ + statement_module = new.module('__main__') + + # use this request's __builtin__, since it changes on each request. + # this is needed for import statements, among other things. + import __builtin__ + statement_module.__builtins__ = __builtin__ + + # load the session from the datastore + session = ShellSession.get(self.request.get('session')) + + # swap in our custom module for __main__. then unpickle the session + # globals, run the statement, and re-pickle the session globals, all + # inside it. + old_main = sys.modules.get('__main__') + try: + sys.modules['__main__'] = statement_module + statement_module.__name__ = '__main__' + + # re-evaluate the unpicklables + for code in session.unpicklables: + exec code in statement_module.__dict__ + + # re-initialize the globals + for name, val in session.globals_dict().items(): + try: + statement_module.__dict__[name] = val + except: + msg = 'Dropping %s since it could not be unpickled.\n' % name + self.response.out.write(msg) + logging.warning(msg + traceback.format_exc()) + session.remove_global(name) + + # run! 
+ old_globals = dict(statement_module.__dict__) + try: + old_stdout = sys.stdout + old_stderr = sys.stderr + try: + sys.stdout = self.response.out + sys.stderr = self.response.out + exec compiled in statement_module.__dict__ + finally: + sys.stdout = old_stdout + sys.stderr = old_stderr + except: + self.response.out.write(traceback.format_exc()) + return + + # extract the new globals that this statement added + new_globals = {} + for name, val in statement_module.__dict__.items(): + if name not in old_globals or val != old_globals[name]: + new_globals[name] = val + + if True in [isinstance(val, UNPICKLABLE_TYPES) + for val in new_globals.values()]: + # this statement added an unpicklable global. store the statement and + # the names of all of the globals it added in the unpicklables. + session.add_unpicklable(statement, new_globals.keys()) + logging.debug('Storing this statement as an unpicklable.') + + else: + # this statement didn't add any unpicklables. pickle and store the + # new globals back into the datastore. + for name, val in new_globals.items(): + if not name.startswith('__'): + session.set_global(name, val) + + finally: + sys.modules['__main__'] = old_main + + session.put() + + +def main(): + """Main program. + """ + + application = webapp.WSGIApplication( + [('/admin/shell', FrontPageHandler), + ('/admin/shell/shell.do', StatementHandler)], debug=_DEBUG) + wsgiref.handlers.CGIHandler().run(application) + + +if __name__ == '__main__': + main() diff -r 3156760b4d26 -r 4cc66ab098e8 app/shell/static/shell.js --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/app/shell/static/shell.js Tue May 26 02:37:39 2009 +0200 @@ -0,0 +1,195 @@ +// Copyright 2007 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** + * @fileoverview + * Javascript code for the interactive AJAX shell. + * + * Part of http://code.google.com/p/google-app-engine-samples/. + * + * Includes a function (shell.runStatement) that sends the current python + * statement in the shell prompt text box to the server, and a callback + * (shell.done) that displays the results when the XmlHttpRequest returns. + * + * Also includes cross-browser code (shell.getXmlHttpRequest) to get an + * XmlHttpRequest. + */ + +/** + * Shell namespace. + * @type {Object} + */ +var shell = {} + +/** + * The shell history. history is an array of strings, ordered oldest to + * newest. historyCursor is the current history element that the user is on. + * + * The last history element is the statement that the user is currently + * typing. When a statement is run, it's frozen in the history, a new history + * element is added to the end of the array for the new statement, and + * historyCursor is updated to point to the new element. + * + * @type {Array} + */ +shell.history = ['']; + +/** + * See {shell.history} + * @type {number} + */ +shell.historyCursor = 0; + +/** + * A constant for the XmlHttpRequest 'done' state. + * @type Number + */ +shell.DONE_STATE = 4; + +/** + * A cross-browser function to get an XmlHttpRequest object. 
+ * + * @return {XmlHttpRequest?} a new XmlHttpRequest + */ +shell.getXmlHttpRequest = function() { + if (window.XMLHttpRequest) { + return new XMLHttpRequest(); + } else if (window.ActiveXObject) { + try { + return new ActiveXObject('Msxml2.XMLHTTP'); + } catch(e) { + return new ActiveXObject('Microsoft.XMLHTTP'); + } + } + + return null; +}; + +/** + * This is the prompt textarea's onkeypress handler. Depending on the key that + * was pressed, it will run the statement, navigate the history, or update the + * current statement in the history. + * + * @param {Event} event the keypress event + * @return {Boolean} false to tell the browser not to submit the form. + */ +shell.onPromptKeyPress = function(event) { + var statement = document.getElementById('statement'); + + if (this.historyCursor == this.history.length - 1) { + // we're on the current statement. update it in the history before doing + // anything. + this.history[this.historyCursor] = statement.value; + } + + // should we pull something from the history? + if (event.shiftKey && event.keyCode == 38 /* up arrow */) { + if (this.historyCursor > 0) { + statement.value = this.history[--this.historyCursor]; + } + return false; + } else if (event.shiftKey && event.keyCode == 40 /* down arrow */) { + if (this.historyCursor < this.history.length - 1) { + statement.value = this.history[++this.historyCursor]; + } + return false; + } else if (!event.altKey) { + // probably changing the statement. update it in the history. + this.historyCursor = this.history.length - 1; + this.history[this.historyCursor] = statement.value; + } + + // should we submit? + var ctrlEnter = (document.getElementById('submit_key').value == 'ctrl-enter'); + if (event.keyCode == 13 /* enter */ && !event.altKey && !event.shiftKey && + event.ctrlKey == ctrlEnter) { + return this.runStatement(); + } +}; + +/** + * The XmlHttpRequest callback. If the request succeeds, it adds the command + * and its resulting output to the shell history div. 
+ * + * @param {XmlHttpRequest} req the XmlHttpRequest we used to send the current + * statement to the server + */ +shell.done = function(req) { + if (req.readyState == this.DONE_STATE) { + var statement = document.getElementById('statement') + statement.className = 'prompt'; + + // add the command to the shell output + var output = document.getElementById('output'); + + output.value += '\n>>> ' + statement.value; + statement.value = ''; + + // add a new history element + this.history.push(''); + this.historyCursor = this.history.length - 1; + + // add the command's result + var result = req.responseText.replace(/^\s*|\s*$/g, ''); // trim whitespace + if (result != '') + output.value += '\n' + result; + + // scroll to the bottom + output.scrollTop = output.scrollHeight; + if (output.createTextRange) { + var range = output.createTextRange(); + range.collapse(false); + range.select(); + } + } +}; + +/** + * This is the form's onsubmit handler. It sends the python statement to the + * server, and registers shell.done() as the callback to run when it returns. + * + * @return {Boolean} false to tell the browser not to submit the form. + */ +shell.runStatement = function() { + var form = document.getElementById('form'); + + // build a XmlHttpRequest + var req = this.getXmlHttpRequest(); + if (!req) { + document.getElementById('ajax-status').innerHTML = + "Your browser doesn't support AJAX. :("; + return false; + } + + req.onreadystatechange = function() { shell.done(req); }; + + // build the query parameter string + var params = ''; + for (i = 0; i < form.elements.length; i++) { + var elem = form.elements[i]; + if (elem.type != 'submit' && elem.type != 'button' && elem.id != 'caret') { + var value = escape(elem.value).replace(/\+/g, '%2B'); // escape ignores + + params += '&' + elem.name + '=' + value; + } + } + + // send the request and tell the user. + document.getElementById('statement').className = 'prompt processing'; + req.open(form.method, form.action + '?' 
+ params, true); + req.setRequestHeader('Content-type', + 'application/x-www-form-urlencoded;charset=UTF-8'); + req.send(null); + + return false; +}; diff -r 3156760b4d26 -r 4cc66ab098e8 app/shell/static/spinner.gif Binary file app/shell/static/spinner.gif has changed diff -r 3156760b4d26 -r 4cc66ab098e8 app/shell/templates/shell.html --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/app/shell/templates/shell.html Tue May 26 02:37:39 2009 +0200 @@ -0,0 +1,124 @@ + + + + + Interactive Shell + + + + + + +

    Interactive server-side Python shell + (original source) +

    +

    + Return to main home +

    + + + +
    + + + + + + + + +

    + +

    +{% if user %} + {{ user.nickname }} + (log out) +{% else %} + log in +{% endif %} + | Shift-Up/Down for history | + + +

    + + + + + + diff -r 3156760b4d26 -r 4cc66ab098e8 app/soc/content/css/ui.datetimepicker-090304.css diff -r 3156760b4d26 -r 4cc66ab098e8 app/soc/cron/job.py --- a/app/soc/cron/job.py Mon May 25 23:42:15 2009 +0200 +++ b/app/soc/cron/job.py Tue May 26 02:37:39 2009 +0200 @@ -29,8 +29,10 @@ from google.appengine.runtime import DeadlineExceededError from soc.cron import student_proposal_mailer +from soc.cron import unique_user_id_adder from soc.models.job import Job + class Error(Exception): """Base class for all exceptions raised by this module. """ @@ -67,6 +69,10 @@ student_proposal_mailer.setupStudentProposalMailing self.tasks['sendStudentProposalMail'] = \ student_proposal_mailer.sendStudentProposalMail + self.tasks['setupUniqueUserIdAdder'] = \ + unique_user_id_adder.setupUniqueUserIdAdder + self.tasks['addUniqueUserIds'] = \ + unique_user_id_adder.addUniqueUserIds def claimJob(self, job_key): """A transaction to claim a job. diff -r 3156760b4d26 -r 4cc66ab098e8 app/soc/cron/unique_user_id_adder.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/app/soc/cron/unique_user_id_adder.py Tue May 26 02:37:39 2009 +0200 @@ -0,0 +1,135 @@ +#!/usr/bin/python2.5 +# +# Copyright 2009 the Melange authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Cron job handler for adding unique user id. 
+""" + +__authors__ = [ + '"Pawel Solyga" ', + ] + + +from google.appengine.ext import db +from google.appengine.api import users +from soc.logic.models.job import logic as job_logic +from soc.logic.models.priority_group import logic as priority_logic +from soc.logic.models.user import logic as user_logic +from soc.models.user import User + + +# amount of users to create jobs for before updating +DEF_USER_STEP_SIZE = 10 + + +class TempUserWithUniqueId(db.Model): + """Helper model for temporary storing User Property with unique id. + """ + user = db.UserProperty(required=True) + + +def emailToAccountAndUserId(address): + """Return a stable user_id string based on an email address, or None if + the address is not a valid/existing google account. + """ + user = users.User(address) + key = TempUserWithUniqueId(user=user).put() + obj = TempUserWithUniqueId.get(key) + return (obj, obj.user.user_id()) + + +def setupUniqueUserIdAdder(job_entity): + """Job that setup jobs that will add unique user ids to all Users. 
+ + Args: + job_entity: a Job entity with key_data set to + [last_completed_user] + """ + + from soc.cron.job import FatalJobError + + user_fields = {'user_id': None} + + if len(key_data) == 1: + # start where we left off + user_fields['__key__ >'] = key_data[0] + + m_users = user_logic.getForFields(user_fields, + limit=DEF_USER_STEP_SIZE) + + # set the default fields for the jobs we are going to create + priority_group = priority_logic.getGroup(priority_logic.CONVERT) + job_fields = { + 'priority_group': priority_group, + 'task_name': 'addUniqueUserIds'} + + job_query_fields = job_fields.copy() + + while m_users: + # for each user create a adder job + for user in m_users: + + job_query_fields['key_data'] = user.key() + adder_job = job_logic.getForFields(job_query_fields, unique=True) + + if not adder_job: + # this user doesn't have unique id yet + job_fields['key_data'] = [user.key()] + job_logic.updateOrCreateFromFields(job_fields) + + # update our own job + last_user_key = m_users[-1].key() + + if len(key_data) == 1: + key_data[0] = last_student_key + else: + key_data.append(last_student_key) + + updated_job_fields = {'key_data': key_data} + job_logic.updateEntityProperties(job_entity, updated_job_fields) + + # rinse and repeat + user_fields['__key__ >'] = last_user_key + m_users = student_logic.getForFields(user_fields, + limit=DEF_USER_STEP_SIZE) + + # we are finished + return + + +def addUniqueUserIds(job_entity): + """Job that will add unique user id to a User. + + Args: + job_entity: a Job entity with key_data set to [user_key] + """ + + from soc.cron.job import FatalJobError + + user_keyname = job_entity.key_data[0].name() + user_entity = user_logic.getFromKeyName(user_keyname) + + if not user_entity: + raise FatalJobError('The User with keyname %s does not exist!' 
% ( + user_keyname)) + + # add unique user id + account, user_id = emailToAccountAndUserId(user_entity.account.email()) + user_entity.account = account + user_entity.user_id = user_id + user_entity.put() + + # we are done here + return \ No newline at end of file diff -r 3156760b4d26 -r 4cc66ab098e8 app/soc/logic/accounts.py --- a/app/soc/logic/accounts.py Mon May 25 23:42:15 2009 +0200 +++ b/app/soc/logic/accounts.py Tue May 26 02:37:39 2009 +0200 @@ -35,6 +35,13 @@ return normalizeAccount(account) if (account and normalize) else account +def getCurrentUserId(): + """Returns a unique id of the current user. + """ + + return users.get_current_user().user_id() + + def normalizeAccount(account): """Returns a normalized version of the specified account. """ @@ -46,6 +53,7 @@ return users.User(email=normalized) + def denormalizeAccount(account): """Returns a denormalized version of the specified account. """ @@ -58,6 +66,7 @@ return users.User(email=denormalized) + def isDeveloper(account=None): """Returns True if a Google Account is a Developer with special privileges. diff -r 3156760b4d26 -r 4cc66ab098e8 app/soc/logic/cleaning.py --- a/app/soc/logic/cleaning.py Mon May 25 23:42:15 2009 +0200 +++ b/app/soc/logic/cleaning.py Tue May 26 02:37:39 2009 +0200 @@ -21,10 +21,11 @@ '"Todd Larsen" ', '"Sverre Rabbelier" ', '"Lennard de Rijk" ', + '"Pawel Solyga" ', ] -import feedparser +from htmlsanitizer import HtmlSanitizer from google.appengine.api import users @@ -379,16 +380,25 @@ def wrapped(self): """Decorator wrapper method. """ + from HTMLParser import HTMLParseError content = self.cleaned_data.get(field_name) + # clean_html_content is called when writing data into GAE rather than + # when reading data from GAE. This short-circuiting of the sanitizer + # only affects html authored by developers. The isDeveloper test for + # example allows developers to add javascript. 
if user_logic.isDeveloper(): return content - - sanitizer = feedparser._HTMLSanitizer('utf-8') - sanitizer.feed(content) - content = sanitizer.output() - content = content.decode('utf-8') + + try: + cleaner = HtmlSanitizer.Cleaner() + cleaner.string = content + cleaner.clean() + except HTMLParseError, msg: + raise forms.ValidationError(msg) + + content = cleaner.string content = content.strip().replace('\r\n', '\n') return content diff -r 3156760b4d26 -r 4cc66ab098e8 app/soc/logic/helper/notifications.py --- a/app/soc/logic/helper/notifications.py Mon May 25 23:42:15 2009 +0200 +++ b/app/soc/logic/helper/notifications.py Tue May 26 02:37:39 2009 +0200 @@ -191,6 +191,7 @@ 'scope_path': to_user.link_id } + import soc.logic.models.notification key_name = model_logic.notification.logic.getKeyNameFromFields(fields) # create and put a new notification in the datastore @@ -204,6 +205,8 @@ notification_entity: Notification about which the message should be sent """ + import soc.views.models.notification + # create the url to show this notification notification_url = "http://%(host)s%(index)s" % { 'host' : os.environ['HTTP_HOST'], diff -r 3156760b4d26 -r 4cc66ab098e8 app/soc/logic/models/base.py --- a/app/soc/logic/models/base.py Mon May 25 23:42:15 2009 +0200 +++ b/app/soc/logic/models/base.py Tue May 26 02:37:39 2009 +0200 @@ -25,6 +25,8 @@ ] +import logging + from google.appengine.ext import db from django.utils.translation import ugettext @@ -324,7 +326,13 @@ query = self.getQueryForFields(filter=filter, order=order) - result = query.fetch(limit, offset) + try: + result = query.fetch(limit, offset) + except db.NeedIndexError, exception: + result = [] + logging.exception("%s, model: %s filter: %s, order: %s" % + (exception, self._model, filter, order)) + # TODO: send email if unique: return result[0] if result else None diff -r 3156760b4d26 -r 4cc66ab098e8 app/soc/logic/models/user.py --- a/app/soc/logic/models/user.py Mon May 25 23:42:15 2009 +0200 +++ 
b/app/soc/logic/models/user.py Tue May 26 02:37:39 2009 +0200 @@ -75,6 +75,20 @@ return self.getForAccount(account) + def getForCurrentUserId(self): + """Retrieves the user entity for the currently logged in user id. + + If there is no user logged in, or they have no valid associated User + entity, None is returned. + """ + + user_id = accounts.getCurrentUserId() + + if not user_id: + return None + + return self.getForUserId(user_id) + def getForAccount(self, account): """Retrieves the user entity for the specified account. @@ -94,6 +108,23 @@ return self.getForFields(filter=fields, unique=True) + def getForUserId(self, user_id): + """Retrieves the user entity for the specified user id. + + If there is no user logged in, or they have no valid associated User + entity, None is returned. + """ + + if not user_id: + raise base.InvalidArgumentError + + fields = { + 'user_id': user_id, + 'status':'valid', + } + + return self.getForFields(filter=fields, unique=True) + def isDeveloper(self, account=None, user=None): """Returns true iff the specified user is a Developer. 
diff -r 3156760b4d26 -r 4cc66ab098e8 app/soc/models/seed_db.py --- a/app/soc/models/seed_db.py Mon May 25 23:42:15 2009 +0200 +++ b/app/soc/models/seed_db.py Tue May 26 02:37:39 2009 +0200 @@ -139,7 +139,7 @@ def seed(self, i, entities=None, current_user=None, gsoc2009=None): properties = { - 'key_name': 'google/gsoc2009/%04d' % i, + 'key_name': 'google/gsoc2009/org_%04d' % i, 'link_id': 'org_%04d' % i, 'name': 'Organization %04d' % i, 'short_name': 'Org %04d' % i, @@ -176,6 +176,58 @@ gsoc2009=gsoc2009) +class OrgApplicationSeeder(Seeder): + def type(self): + return OrgApplication + + def commonSeedArgs(self, request): + _, current_user = ensureUser() + gsoc2009 = Program.get_by_key_name('google/gsoc2009') + + if not gsoc2009: + raise Error('Run seed_db first') + + status = request.GET.get('status', 'pre-accepted') + + return dict(current_user=current_user, + gsoc2009=gsoc2009, + status=status) + + + def seed(self, i, entities=None, current_user=None, gsoc2009=None, + status=None): + properties = { + 'key_name': 'google/gsoc2009/org_%04d' % i, + 'link_id': 'org_%04d' % i, + 'name': 'Org App %04d' % i, + 'scope_path': 'google/gsoc2009', + 'scope': gsoc2009, + 'status': status, + 'applicant': current_user, + 'home_page': 'http://www.google.com', + 'email': 'org@example.com', + 'irc_channel': '#care', + 'pub_mailing_list': 'http://groups.google.com', + 'dev_mailing_list': 'http://groups.google.com', + 'description': 'This is an awesome org!', + 'why_applying': 'Because we can', + 'member_criteria': 'They need to be awesome', + 'license_name': 'Apache License, 2.0', + 'ideas': 'http://code.google.com/p/soc/issues', + 'contrib_disappears': 'We use google to find them', + 'member_disappears': 'See above', + 'encourage_contribs': 'We offer them cookies.', + 'continued_contribs': 'We promise them a cake.', + 'agreed_to_admin_agreement': True, + } + + org_application = OrgApplication(**properties) + if entities is None: + org_application.put() + else: + 
entities.append(org_application) + + def seed(request, *args, **kwargs): """Seeds the datastore with some default values. """ @@ -322,9 +374,9 @@ } for i in range(10): - org_app_properties['key_name'] = 'google/gsoc2009/wannabe_%d' % i - org_app_properties['link_id'] = 'wannabe_%d' % i - org_app_properties['name'] = 'Wannabe %d' % i + org_app_properties['key_name'] = 'google/gsoc2009/org_%04d' % i + org_app_properties['link_id'] = 'org_%04d' % i + org_app_properties['name'] = 'Org App %04d' % i entity = OrgApplication(**org_app_properties) entity.put() @@ -479,8 +531,8 @@ raise Error('Run seed_db first') properties = { - 'key_name': 'google/gsoc2009/org_app_%d' % i, - 'link_id': 'org_app_%d' % i, + 'key_name': 'google/gsoc2009/org_%d' % i, + 'link_id': 'org_%d' % i, 'name': 'Org App %d' % i, 'scope_path': 'google/gsoc2009', 'scope': gsoc2009, @@ -577,16 +629,16 @@ def seed_student(request, i): """Returns the properties for a new student entity. """ - + gsoc2009 = Program.get_by_key_name('google/gsoc2009') user = User.get_by_key_name('user_%d' % i) - + if not gsoc2009: raise Error('Run seed_db first') - + if not user: raise Error('Run seed_many for at least %d users first.' % i) - + properties = { 'key_name':'google/gsoc2009/student_%d' % i, 'link_id': 'student_%d' % i, @@ -627,13 +679,13 @@ mentor = Mentor.get_by_key_name('google/gsoc2009/org_%d/mentor' % i) user = User.get_by_key_name('user_%d' % i) student = Student.get_by_key_name('google/gsoc2009/student_%d' % i) - + if not user: raise Error('Run seed_many for at least %d users first.' % i) if not student: raise Error('Run seed_many for at least %d students first.' % i) - + if not org: raise Error('Run seed_many for at least %d orgs first.' % i) @@ -669,6 +721,7 @@ SEEDABLE_MODEL_TYPES = { 'user' : UserSeeder(), 'organization' : OrganizationSeeder(), + 'org_application' : OrgApplicationSeeder(), } @@ -716,7 +769,7 @@ # so, we look for what's after the _ and turn it into an int. 
link_id = highest_instance.link_id if '_' in link_id: - start_index = int(link_id.split('_')[1]) + 1 + start_index = int(link_id.split('_')[-1]) + 1 else: # couldn't find seeded_entities; guessing there are none start_index = 0 diff -r 3156760b4d26 -r 4cc66ab098e8 app/soc/models/user.py --- a/app/soc/models/user.py Mon May 25 23:42:15 2009 +0200 +++ b/app/soc/models/user.py Tue May 26 02:37:39 2009 +0200 @@ -71,6 +71,9 @@ verbose_name=ugettext('User account')) account.help_text = ugettext( 'A valid Google Account.') + + #: Google Account unique user id + user_id = db.StringProperty(required=False) #: A list (possibly empty) of former Google Accounts associated with #: this User. diff -r 3156760b4d26 -r 4cc66ab098e8 app/soc/modules/__init__.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/app/soc/modules/__init__.py Tue May 26 02:37:39 2009 +0200 @@ -0,0 +1,17 @@ +# +# Copyright 2009 the Melange authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""This is the main modules module. +""" diff -r 3156760b4d26 -r 4cc66ab098e8 app/soc/modules/callback.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/app/soc/modules/callback.py Tue May 26 02:37:39 2009 +0200 @@ -0,0 +1,40 @@ +# Copyright 2009 the Melange authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Module containing Melange callbacks. +""" + +__authors__ = [ + '"Sverre Rabbelier" ', + '"Lennard de Rijk" ', + ] + + +CORE = None + + +def registerCore(core): + """Registers the specified callback as core. + """ + + global CORE + CORE = core + + +def getCore(): + """Returns the Core handler. + """ + + global CORE + return CORE diff -r 3156760b4d26 -r 4cc66ab098e8 app/soc/modules/core.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/app/soc/modules/core.py Tue May 26 02:37:39 2009 +0200 @@ -0,0 +1,235 @@ +# Copyright 2009 the Melange authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""The Melange Core module. +""" + +__authors__ = [ + '"Sverre Rabbelier" ', + '"Lennard de Rijk" ', + ] + + +from django.conf.urls import defaults + +import settings +import soc.cache.sidebar + + +class Error(Exception): + """Error class for the callback module. + """ + + pass + + +class APIVersionMismatch(Error): + """Error raised when API version mismatches. + """ + + MISMATCH_MSG_FMT = "API mismatch, expected '%d', got '%d'." 
+ + def __init__(self, expected, actual): + """Instantiates a new exception with a customized message. + """ + + msg = self.MISMATCH_MSG_FMT % (expected, actual) + super(APIVersionMismatch, self).__init__(msg) + + +class MissingService(Error): + """Error raised when a required service is missing. + """ + + MISSING_SERVICE_FMT = "Required service '%s' is not registered, known: %s" + + def __init__(self, service, services): + """Instantiates a new exception with a customized message. + """ + + msg = self.MISSING_SERVICE_FMT % (service, services) + super(MissingService, self).__init__(msg) + + +class NonUniqueService(Error): + """Error raised when a required service is missing. + """ + + NON_UNIQUE_SERVICE_FMT = "Unique service '%s' called a second time, known: %s." + + def __init__(self, service, services): + """Instantiates a new exception with a customized message. + """ + + msg = self.NON_UNIQUE_SERVICE_FMT % (service, services) + super(NonUniqueService, self).__init__(msg) + + +class Core(object): + """The core handler that controls the Melange API. + """ + + def __init__(self): + """Creates a new instance of the Core. + """ + + self.API_VERSION = 1 + + self.registered_callbacks = [] + self.capability = [] + self.services = [] + + self.sitemap = [] + self.sidebar = [] + + ## + ## internal + ## + + def getService(self, callback, service): + """Retrieves the specified service from the callback if supported. + + Args: + callback: the callback to retrieve the capability from + service: the service to retrieve + """ + + if not hasattr(callback, service): + return False + + func = getattr(callback, service) + + if not callable(func): + return False + + return func + + ## + ## Core code + ## + + def getPatterns(self): + """Returns the Django patterns for this site. 
+ """ + + self.callService('registerWithSitemap', True) + return defaults.patterns(None, *self.sitemap) + + @soc.cache.sidebar.cache + def getSidebar(self, id, user): + """Constructs a sidebar for the current user. + """ + + self.callService('registerWithSidebar', True) + + sidebar = [] + + for i in self.sidebar: + menus = i(id, user) + + for menu in (menus if menus else []): + sidebar.append(menu) + + return sorted(sidebar, key=lambda x: x.get('group')) + + def callService(self, service, unique, *args, **kwargs): + """Calls the specified service on all callbacks. + """ + + if unique and (service in self.services): + return + + results = [] + + for callback in self.registered_callbacks: + func = self.getService(callback, service) + if not func: + continue + + result = func(*args, **kwargs) + results.append(result) + + self.services.append(service) + return results + + def registerModuleCallbacks(self): + """Retrieves all callbacks for the modules of this site. + + Callbacks for modules without a version number or the wrong API_VERSION + number are dropped. They won't be called. + """ + + fmt = settings.MODULE_FMT + modules = ['soc_core'] + settings.MODULES + modules = [__import__(fmt % i, fromlist=['']) for i in modules] + + for callback_class in [i.getCallback() for i in modules]: + if callback_class.API_VERSION != self.API_VERSION: + raise callback.APIVersionMismatch(self.API_VERSION, + callback_class.API_VERSION) + + + callback = callback_class(self) + self.registered_callbacks.append(callback) + + return True + + ## + ## Module code + ## + + def registerCapability(self, capability): + """Registers the specified capability. + """ + + self.capabilities.append(capability) + + def requireCapability(self, capability): + """Requires that the specified capability is present. 
+ """ + + if capability in self.capabilities: + return True + + raise MissingCapability(capability, self.capability) + + def requireService(self, service): + """Requires that the specified service has been called. + """ + + if service in self.services: + return True + + raise MissingService(service, self.services) + + def requireUniqueService(self, service): + """Requires that the specified service is called exactly once. + """ + + if service not in self.services: + return True + + raise NonUniqueService(service, self.services) + + def registerSitemapEntry(self, entries): + """Registers the specified entries with the sitemap. + """ + + self.sitemap.extend(entries) + + def registerSidebarEntry(self, entry): + """Registers the specified entry with the sidebar. + """ + + self.sidebar.append(entry) diff -r 3156760b4d26 -r 4cc66ab098e8 app/soc/modules/soc_core/__init__.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/app/soc/modules/soc_core/__init__.py Tue May 26 02:37:39 2009 +0200 @@ -0,0 +1,29 @@ +# +# Copyright 2009 the Melange authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""This is the main modules module. 
+""" + +__authors__ = [ + '"Sverre Rabbelier" ', + '"Lennard de Rijk" ', + ] + + + +from soc.modules.soc_core import callback + +def getCallback(): + return callback.Callback diff -r 3156760b4d26 -r 4cc66ab098e8 app/soc/modules/soc_core/callback.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/app/soc/modules/soc_core/callback.py Tue May 26 02:37:39 2009 +0200 @@ -0,0 +1,129 @@ +# Copyright 2009 the Melange authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Module containing the core callback. 
+""" + +__authors__ = [ + '"Sverre Rabbelier" ', + '"Lennard de Rijk" ', + ] + + +from soc.modules import callback + +from soc.views.models import club +from soc.views.models import club_app +from soc.views.models import club_admin +from soc.views.models import club_member +from soc.views.models import cron +from soc.views.models import document +from soc.views.models import host +from soc.views.models import job +from soc.views.models import mentor +from soc.views.models import notification +from soc.views.models import organization +from soc.views.models import org_admin +from soc.views.models import org_app +from soc.views.models import priority_group +from soc.views.models import program +from soc.views.models import request +from soc.views.models import site +from soc.views.models import sponsor +from soc.views.models import student +from soc.views.models import student_project +from soc.views.models import student_proposal +from soc.views.models import timeline +from soc.views.models import user +from soc.views.models import user_self + + +class Callback(object): + """Callback object that handles interaction between the core. + """ + + API_VERSION = 1 + + def __init__(self, core): + """Initializes a new Callback object for the specified core. + """ + + self.core = core + + # disable clubs + self.enable_clubs = False + + def registerWithSitemap(self): + """Called by the server when sitemap entries should be registered. 
+ """ + + self.core.requireUniqueService('registerWithSitemap') + + if self.enable_clubs: + self.core.registerSitemapEntry(club.view.getDjangoURLPatterns()) + self.core.registerSitemapEntry(club_admin.view.getDjangoURLPatterns()) + self.core.registerSitemapEntry(club_app.view.getDjangoURLPatterns()) + self.core.registerSitemapEntry(club_member.view.getDjangoURLPatterns()) + + self.core.registerSitemapEntry(cron.view.getDjangoURLPatterns()) + self.core.registerSitemapEntry(document.view.getDjangoURLPatterns()) + self.core.registerSitemapEntry(host.view.getDjangoURLPatterns()) + self.core.registerSitemapEntry(job.view.getDjangoURLPatterns()) + self.core.registerSitemapEntry(mentor.view.getDjangoURLPatterns()) + self.core.registerSitemapEntry(notification.view.getDjangoURLPatterns()) + self.core.registerSitemapEntry(organization.view.getDjangoURLPatterns()) + self.core.registerSitemapEntry(org_admin.view.getDjangoURLPatterns()) + self.core.registerSitemapEntry(org_app.view.getDjangoURLPatterns()) + self.core.registerSitemapEntry(priority_group.view.getDjangoURLPatterns()) + self.core.registerSitemapEntry(program.view.getDjangoURLPatterns()) + self.core.registerSitemapEntry(request.view.getDjangoURLPatterns()) + self.core.registerSitemapEntry(site.view.getDjangoURLPatterns()) + self.core.registerSitemapEntry(sponsor.view.getDjangoURLPatterns()) + self.core.registerSitemapEntry(student.view.getDjangoURLPatterns()) + self.core.registerSitemapEntry(student_project.view.getDjangoURLPatterns()) + self.core.registerSitemapEntry(student_proposal.view.getDjangoURLPatterns()) + self.core.registerSitemapEntry(timeline.view.getDjangoURLPatterns()) + self.core.registerSitemapEntry(user_self.view.getDjangoURLPatterns()) + self.core.registerSitemapEntry(user.view.getDjangoURLPatterns()) + + def registerWithSidebar(self): + """Called by the server when sidebar entries should be registered. 
+ """ + + self.core.requireUniqueService('registerWithSidebar') + + if self.enable_clubs: + self.core.registerSidebarEntry(club.view.getSidebarMenus) + self.core.registerSidebarEntry(club.view.getExtraMenus) + self.core.registerSidebarEntry(club_admin.view.getSidebarMenus) + self.core.registerSidebarEntry(club_member.view.getSidebarMenus) + self.core.registerSidebarEntry(club_app.view.getSidebarMenus) + + self.core.registerSidebarEntry(user_self.view.getSidebarMenus) + self.core.registerSidebarEntry(site.view.getSidebarMenus) + self.core.registerSidebarEntry(user.view.getSidebarMenus) + self.core.registerSidebarEntry(sponsor.view.getSidebarMenus) + self.core.registerSidebarEntry(sponsor.view.getExtraMenus) + self.core.registerSidebarEntry(host.view.getSidebarMenus) + self.core.registerSidebarEntry(request.view.getSidebarMenus) + self.core.registerSidebarEntry(program.view.getSidebarMenus) + self.core.registerSidebarEntry(program.view.getExtraMenus) + self.core.registerSidebarEntry(student.view.getSidebarMenus) + self.core.registerSidebarEntry(student_project.view.getSidebarMenus) + self.core.registerSidebarEntry(student_proposal.view.getSidebarMenus) + self.core.registerSidebarEntry(organization.view.getSidebarMenus) + self.core.registerSidebarEntry(organization.view.getExtraMenus) + self.core.registerSidebarEntry(org_admin.view.getSidebarMenus) + self.core.registerSidebarEntry(mentor.view.getSidebarMenus) + self.core.registerSidebarEntry(org_app.view.getSidebarMenus) diff -r 3156760b4d26 -r 4cc66ab098e8 app/soc/templates/soc/club_admin/manage.html --- a/app/soc/templates/soc/club_admin/manage.html Mon May 25 23:42:15 2009 +0200 +++ b/app/soc/templates/soc/club_admin/manage.html Tue May 26 02:37:39 2009 +0200 @@ -23,7 +23,17 @@
    {% endblock %} diff -r 3156760b4d26 -r 4cc66ab098e8 app/soc/templates/soc/club_member/manage.html --- a/app/soc/templates/soc/club_member/manage.html Mon May 25 23:42:15 2009 +0200 +++ b/app/soc/templates/soc/club_member/manage.html Tue May 26 02:37:39 2009 +0200 @@ -23,7 +23,17 @@ {% endblock %} diff -r 3156760b4d26 -r 4cc66ab098e8 app/soc/templates/soc/host/manage.html --- a/app/soc/templates/soc/host/manage.html Mon May 25 23:42:15 2009 +0200 +++ b/app/soc/templates/soc/host/manage.html Tue May 26 02:37:39 2009 +0200 @@ -23,7 +23,17 @@ {% endblock %} diff -r 3156760b4d26 -r 4cc66ab098e8 app/soc/templates/soc/mentor/manage.html --- a/app/soc/templates/soc/mentor/manage.html Mon May 25 23:42:15 2009 +0200 +++ b/app/soc/templates/soc/mentor/manage.html Tue May 26 02:37:39 2009 +0200 @@ -23,7 +23,17 @@ {% endblock %} diff -r 3156760b4d26 -r 4cc66ab098e8 app/soc/templates/soc/models/edit.html --- a/app/soc/templates/soc/models/edit.html Mon May 25 23:42:15 2009 +0200 +++ b/app/soc/templates/soc/models/edit.html Tue May 26 02:37:39 2009 +0200 @@ -58,11 +58,17 @@ {% endif %} {% endblock %} {% if entity %} {% block delete_button %} diff -r 3156760b4d26 -r 4cc66ab098e8 app/soc/templates/soc/notification/list/row.html --- a/app/soc/templates/soc/notification/list/row.html Mon May 25 23:42:15 2009 +0200 +++ b/app/soc/templates/soc/notification/list/row.html Tue May 26 02:37:39 2009 +0200 @@ -11,5 +11,5 @@ - + diff -r 3156760b4d26 -r 4cc66ab098e8 app/soc/templates/soc/org_admin/manage.html --- a/app/soc/templates/soc/org_admin/manage.html Mon May 25 23:42:15 2009 +0200 +++ b/app/soc/templates/soc/org_admin/manage.html Tue May 26 02:37:39 2009 +0200 @@ -23,7 +23,17 @@ {% endblock %} diff -r 3156760b4d26 -r 4cc66ab098e8 app/soc/templates/soc/student/manage.html --- a/app/soc/templates/soc/student/manage.html Mon May 25 23:42:15 2009 +0200 +++ b/app/soc/templates/soc/student/manage.html Tue May 26 02:37:39 2009 +0200 @@ -23,7 +23,17 @@ {% endblock %} diff -r 3156760b4d26 
-r 4cc66ab098e8 app/soc/views/helper/lists.py --- a/app/soc/views/helper/lists.py Mon May 25 23:42:15 2009 +0200 +++ b/app/soc/views/helper/lists.py Tue May 26 02:37:39 2009 +0200 @@ -22,6 +22,7 @@ '"Pawel Solyga" ', ] +import logging from soc.logic import dicts from soc.logic.models.user import logic as user_logic @@ -60,8 +61,6 @@ OFFSET_KEY = 'offset_%d' LIMIT_KEY = 'limit_%d' -OFFSET_KEYNAME_KEY = 'offset_keyname_%d' -REVERSE_DIRECTION_KEY = 'reverse_sort_direction_%d' def makeOffsetKey(limit_idx): @@ -72,14 +71,6 @@ return LIMIT_KEY % limit_idx -def makeOffsetKeynameKey(limit_idx): - return OFFSET_KEYNAME_KEY % limit_idx - - -def makeReverseDirectionKey(limit_idx): - return REVERSE_DIRECTION_KEY % limit_idx - - def getListParameters(request, list_index): """Retrieves, converts and validates values for one list @@ -119,44 +110,30 @@ else: limit = min(DEF_MAX_PAGINATION, limit) - result = dict(limit=limit, offset=offset) - offset_keyname_key = makeOffsetLinkidKey(list_index) - offset_keyname = request.GET.get(offset_keyname_key, '') - # TODO(dbentley): URL unescape - result['offset_keyname'] = offset_keyname - - reverse_direction = makeReverseDirectionKey(list_index) in request.GET - result['reverse_direction'] = reverse_direction - - return result + return dict(limit=limit, offset=offset) -class LinkCreator(object): - """A way to create links for a page. +def generateLinkFromGetArgs(request, offset_and_limits): + """Constructs the get args for the url. """ - def __init__(self, request, list_idx, limit): - self.path = request.path - self.base_params = dict( - i for i in request.GET.iteritems() if - i[0].startswith('offset_') or i[0].startswith('limit_')) - self.idx = list_idx - self.base_params[makeLimitKey(self.idx)] = limit + + args = ["%s=%s" % (k, v) for k, v in offset_and_limits.iteritems()] + link_suffix = '?' 
+ '&'.join(args) + + return request.path + link_suffix + - def create(self, offset_keyname=None, export=False, reverse_direction=False): - params = self.base_params.copy() - if offset_linkid is not None: - # TODO(dbentley): URL encode - if offset_linkid == '': - try: - del params[makeOffsetLinkidKey(self.idx)] - except KeyError: - pass - else: - params[makeOffsetLinkidKey(self.idx)]=offset_linkid - if reverse_direction: - params[makeReverseDirectionKey(self.idx)]=True - link_suffix = '&'.join('%s=%s' % (k, v) for k, v in params.iteritems()) - return '%s?%s' % (self.path, link_suffix) +def generateLinkForRequest(request, base_params, updated_params): + """Create a link to the same page as request but with different params + + Params: + request: the request for the page + base_params: the base parameters + updated_params: the parameters to update + """ + params = base_params.copy() + params.update(updated_params) + return generateLinkFromGetArgs(request, params) def getListContent(request, params, filter=None, order=None, @@ -193,38 +170,16 @@ 'last': offset of the last item in the list } """ - + # TODO(dbentley): this appears to be unnecessary indirection, + # as we only use this logic for getForFields, which is never overridden logic = params['logic'] - limit_key = makeLimitKey(idx) - offset_key = makeOffsetKey(idx) - offset_keyname_key = makeOffsetKeynameKey(idx) - reverse_direction_key = makeReverseDirectionKey(idx) + limit_key, offset_key = makeLimitKey(idx), makeOffsetKey(idx) list_params = getListParameters(request, idx) - - limit = list_params['limit'] - offset = list_params['offset'] - offset_keyname = list_params['offset_keyname'] - reverse_direction = list_params['reverse_direction'] - - pagination_form = makePaginationForm(request, limit, limit_key) - - if offset_keyname: - if filter is None: - filter = {} - - if reverse_direction: - filter['__key__ <'] = offset_keyname - else: - filter['__key__ >'] = offset_keyname - - if order is None: - order = [] - if 
reverse_direction: - order.append('-__key__') - else: - order.append('__key__') + limit, offset = list_params['limit'], list_params['offset'] + pagination_form = makePaginationForm(request, list_params['limit'], + limit_key) # Fetch one more to see if there should be a 'next' link data = logic.getForFields(filter=filter, limit=limit+1, offset=offset, @@ -234,60 +189,46 @@ return None more = len(data) > limit - if reverse_direction: - data.reverse() if more: - if reverse_direction: - data = data[1:] - else: - data = data[:limit] - - should_have_next_link = True - if not reverse_direction and not more: - should_have_next_link = False - - # Calculating should_have_previous_link is tricky. It's possible we could - # be creating a previous link to a page that would have 0 entities. - # That would be suboptimal; what's a better way? - should_have_previous_link = False - if offset_keyname: - should_have_previous_link = True - if reverse_direction and not more: - should_have_previous_link = False - - if data: - first_key_name = data[0].key().name_or_id() - last_key_name = data[-1].key().name_or_id() - else: - first_key_name = None - last_key_name = None + del data[limit:] newest = next = prev = export_link = '' - link_creator = LinkCreator(request, idx, limit) + base_params = dict(i for i in request.GET.iteritems() if + i[0].startswith('offset_') or i[0].startswith('limit_')) if params.get('list_key_order'): - export_link = link_creator.create(export=True) + export_link = generateLinkForRequest(request, base_params, {'export' : idx}) - if should_have_next_link: - next = link_creator.create(offset_keyname=last_key_name) + if more: + # TODO(dbentley): here we need to implement a new field "last_key" + next = generateLinkForRequest(request, base_params, {offset_key : offset+limit, + limit_key : limit}) - if should_have_previous_link: - prev = link_creator.create(offset_keyname=first_key_name, - reverse_direction=True) + if offset > 0: + # TODO(dbentley): here we need to 
implement previous in the good way. + prev = generateLinkForRequest(request, base_params, + { offset_key : max(0, offset-limit), + limit_key : limit }) - newest = link_creator.create(offset_keyname='') + if offset > limit: + # Having a link to the first doesn't make sense on the first page (we're on + # it). It also doesn't make sense on the second page (because the first + # page is the previous page). - # TODO(dbentley): add a "last" link (which is now possible because we can - # query with a reverse keyname sorting + # NOTE(dbentley): I personally disagree that it's simpler to do that way, + # because sometimes you want to go to the first page without having to + # consider what page you're on now. + newest = generateLinkForRequest(request, base_params, {offset_key : 0, + limit_key : limit}) content = { 'idx': idx, 'data': data, 'export': export_link, - 'first': first_key_name, - 'last': last_key_name, + 'first': offset+1, + 'last': len(data) > 1 and offset+len(data) or None, 'logic': logic, 'limit': limit, 'newest': newest, diff -r 3156760b4d26 -r 4cc66ab098e8 app/soc/views/helper/params.py --- a/app/soc/views/helper/params.py Mon May 25 23:42:15 2009 +0200 +++ b/app/soc/views/helper/params.py Tue May 26 02:37:39 2009 +0200 @@ -128,7 +128,7 @@ new_params['missing_redirect'] = '/%(url_name)s/create' % params new_params['delete_redirect'] = '/%(url_name)s/list' % params new_params['invite_redirect'] = '/request/list' - new_params['edit_cancel_redirect'] = '/%(url_name)s/list' % params + # new_params['cancel_redirect'] = '/%(url_name)s/list' % params new_params['public_redirect'] = None new_params['sidebar'] = None diff -r 3156760b4d26 -r 4cc66ab098e8 app/soc/views/helper/responses.py --- a/app/soc/views/helper/responses.py Mon May 25 23:42:15 2009 +0200 +++ b/app/soc/views/helper/responses.py Tue May 26 02:37:39 2009 +0200 @@ -33,6 +33,7 @@ from soc.logic import system from soc.logic.models import site from soc.logic.models.user import logic as user_logic +from 
soc.modules import callback from soc.views import helper from soc.views.helper import redirects from soc.views.helper import templates @@ -125,7 +126,7 @@ context['sign_in'] = users.create_login_url(request.path) context['sign_out'] = users.create_logout_url(request.path) - context['sidebar_menu_items'] = sidebar.getSidebar(account, user) + context['sidebar_menu_items'] = callback.getCore().getSidebar(account, user) context['gae_version'] = system.getAppVersion() context['soc_release'] = system.getMelangeVersion() diff -r 3156760b4d26 -r 4cc66ab098e8 app/soc/views/models/base.py --- a/app/soc/views/models/base.py Mon May 25 23:42:15 2009 +0200 +++ b/app/soc/views/models/base.py Tue May 26 02:37:39 2009 +0200 @@ -41,7 +41,8 @@ from soc.views.helper import redirects from soc.views.helper import requests from soc.views.helper import responses -from soc.views import sitemap +from soc.views.sitemap import sidebar +from soc.views.sitemap import sitemap import soc.cache.logic import soc.logic @@ -914,7 +915,7 @@ context['entity_type_plural'] = params['name_plural'] context['entity_type_short'] = params['name_short'] context['entity_type_url'] = params['url_name'] - context['edit_cancel_redirect'] = params.get('edit_cancel_redirect') + context['cancel_redirect'] = params.get('cancel_redirect') context['return_url'] = request.path if params.get('export_content_type') and entity: @@ -952,7 +953,7 @@ of _getSidebarItems on how it uses it. 
""" - return sitemap.sidebar.getSidebarMenus(id, user, params=params) + return sidebar.getSidebarMenus(id, user, params=params) @decorators.merge_params def getDjangoURLPatterns(self, params=None): @@ -967,5 +968,5 @@ params: a dict with params for this View """ - return sitemap.sitemap.getDjangoURLPatterns(params) + return sitemap.getDjangoURLPatterns(params) diff -r 3156760b4d26 -r 4cc66ab098e8 app/soc/views/models/organization.py --- a/app/soc/views/models/organization.py Mon May 25 23:42:15 2009 +0200 +++ b/app/soc/views/models/organization.py Tue May 26 02:37:39 2009 +0200 @@ -554,6 +554,11 @@ ap_list = lists.getListContent(request, ap_params, filter, idx=0, need_content=True) + # this is a temporary fix for sorting Student Projects + # by Student name until we have a view that default + # sorts it self by name (right now we can't do such query) + ap_list['data'].sort(key=lambda sp: sp.student.name().lower()) + contents = [] if ap_list: diff -r 3156760b4d26 -r 4cc66ab098e8 app/soc/views/models/student_project.py --- a/app/soc/views/models/student_project.py Mon May 25 23:42:15 2009 +0200 +++ b/app/soc/views/models/student_project.py Tue May 26 02:37:39 2009 +0200 @@ -552,7 +552,7 @@ responses.useJavaScript(context, params['js_uses_all']) context['page_name'] = page_name # cancel should go to the public view - params['edit_cancel_redirect'] = redirects.getPublicRedirect(entity, params) + params['cancel_redirect'] = redirects.getPublicRedirect(entity, params) if request.POST: return self.stEditPost(request, context, params, entity, **kwargs) diff -r 3156760b4d26 -r 4cc66ab098e8 app/soc/views/sitemap/build.py --- a/app/soc/views/sitemap/build.py Mon May 25 23:42:15 2009 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,112 +0,0 @@ -#!/usr/bin/python2.5 -# -# Copyright 2008 the Melange authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Module that constructs the sitemap. -""" - -__authors__ = [ - '"Sverre Rabbelier" ', - ] - - -from django.conf.urls import defaults - -#from soc.views.models import club -#from soc.views.models import club_app -#from soc.views.models import club_admin -#from soc.views.models import club_member -from soc.views.models import cron -from soc.views.models import document -from soc.views.models import host -from soc.views.models import job -from soc.views.models import mentor -from soc.views.models import notification -from soc.views.models import organization -from soc.views.models import org_admin -from soc.views.models import org_app -from soc.views.models import priority_group -from soc.views.models import program -from soc.views.models import request -from soc.views.models import site -from soc.views.models import sponsor -from soc.views.models import student -from soc.views.models import student_project -from soc.views.models import student_proposal -from soc.views.models import timeline -from soc.views.models import user -from soc.views.models import user_self - -from soc.views.sitemap import sidebar -from soc.views.sitemap import sitemap - - -# TODO: instead of commenting out club stuff, make it depend on a setting - - -sidebar.addMenu(user_self.view.getSidebarMenus) -#sidebar.addMenu(club.view.getSidebarMenus) -#sidebar.addMenu(club.view.getExtraMenus) -#sidebar.addMenu(club_admin.view.getSidebarMenus) -#sidebar.addMenu(club_member.view.getSidebarMenus) -#sidebar.addMenu(club_app.view.getSidebarMenus) -sidebar.addMenu(site.view.getSidebarMenus) 
-sidebar.addMenu(user.view.getSidebarMenus) -#sidebar.addMenu(document.view.getSidebarMenus) -sidebar.addMenu(sponsor.view.getSidebarMenus) -sidebar.addMenu(sponsor.view.getExtraMenus) -sidebar.addMenu(host.view.getSidebarMenus) -sidebar.addMenu(request.view.getSidebarMenus) -sidebar.addMenu(program.view.getSidebarMenus) -sidebar.addMenu(program.view.getExtraMenus) -sidebar.addMenu(student.view.getSidebarMenus) -sidebar.addMenu(student_project.view.getSidebarMenus) -sidebar.addMenu(student_proposal.view.getSidebarMenus) -sidebar.addMenu(organization.view.getSidebarMenus) -sidebar.addMenu(organization.view.getExtraMenus) -sidebar.addMenu(org_admin.view.getSidebarMenus) -sidebar.addMenu(mentor.view.getSidebarMenus) -sidebar.addMenu(org_app.view.getSidebarMenus) - -#sitemap.addPages(club.view.getDjangoURLPatterns()) -#sitemap.addPages(club_admin.view.getDjangoURLPatterns()) -#sitemap.addPages(club_app.view.getDjangoURLPatterns()) -#sitemap.addPages(club_member.view.getDjangoURLPatterns()) -sitemap.addPages(cron.view.getDjangoURLPatterns()) -sitemap.addPages(document.view.getDjangoURLPatterns()) -sitemap.addPages(host.view.getDjangoURLPatterns()) -sitemap.addPages(job.view.getDjangoURLPatterns()) -sitemap.addPages(mentor.view.getDjangoURLPatterns()) -sitemap.addPages(notification.view.getDjangoURLPatterns()) -sitemap.addPages(organization.view.getDjangoURLPatterns()) -sitemap.addPages(org_admin.view.getDjangoURLPatterns()) -sitemap.addPages(org_app.view.getDjangoURLPatterns()) -sitemap.addPages(priority_group.view.getDjangoURLPatterns()) -sitemap.addPages(program.view.getDjangoURLPatterns()) -sitemap.addPages(request.view.getDjangoURLPatterns()) -sitemap.addPages(site.view.getDjangoURLPatterns()) -sitemap.addPages(sponsor.view.getDjangoURLPatterns()) -sitemap.addPages(student.view.getDjangoURLPatterns()) -sitemap.addPages(student_project.view.getDjangoURLPatterns()) -sitemap.addPages(student_proposal.view.getDjangoURLPatterns()) 
-sitemap.addPages(timeline.view.getDjangoURLPatterns()) -sitemap.addPages(user_self.view.getDjangoURLPatterns()) -sitemap.addPages(user.view.getDjangoURLPatterns()) - - -def getPatterns(): - """Retrieves all the url patterns of this site. - """ - return defaults.patterns(None, *sitemap.SITEMAP) diff -r 3156760b4d26 -r 4cc66ab098e8 app/soc/views/sitemap/sidebar.py --- a/app/soc/views/sitemap/sidebar.py Mon May 25 23:42:15 2009 +0200 +++ b/app/soc/views/sitemap/sidebar.py Tue May 26 02:37:39 2009 +0200 @@ -24,38 +24,11 @@ from soc.views import out_of_band -import soc.cache.sidebar - -SIDEBAR = [] SIDEBAR_ACCESS_ARGS = ['SIDEBAR_CALLING'] SIDEBAR_ACCESS_KWARGS = {'SIDEBAR_CALLING': True} -def addMenu(callback): - """Adds a callback to the menu builder. - - The callback should return a list of menu's when called. - """ - global SIDEBAR - SIDEBAR.append(callback) - - -@soc.cache.sidebar.cache -def getSidebar(id, user): - """Constructs a sidebar for the current user. - """ - - sidebar = [] - - for callback in SIDEBAR: - menus = callback(id, user) - - for menu in (menus if menus else []): - sidebar.append(menu) - - return sorted(sidebar, key=lambda x: x.get('group')) - def getSidebarItems(params): """Retrieves a list of sidebar entries for this view. diff -r 3156760b4d26 -r 4cc66ab098e8 app/soc/views/sitemap/sitemap.py --- a/app/soc/views/sitemap/sitemap.py Mon May 25 23:42:15 2009 +0200 +++ b/app/soc/views/sitemap/sitemap.py Tue May 26 02:37:39 2009 +0200 @@ -22,17 +22,6 @@ ] -SITEMAP = [] - - -def addPages(pages): - """Adds the specified pages to the sitemap. - """ - - global SITEMAP - SITEMAP += pages - - def getDjangoURLPatterns(params): """Retrieves a list of sidebar entries for this View. 
diff -r 3156760b4d26 -r 4cc66ab098e8 app/urls.py --- a/app/urls.py Mon May 25 23:42:15 2009 +0200 +++ b/app/urls.py Tue May 26 02:37:39 2009 +0200 @@ -18,15 +18,15 @@ __authors__ = [ '"Augie Fackler" ', '"Todd Larsen" ', + '"Sverre Rabbelier" ', '"Lennard de Rijk" ', '"Pawel Solyga" ', ] -from soc.views.sitemap import build +from soc.modules import callback - -urlpatterns = build.getPatterns() +urlpatterns = callback.getCore().getPatterns() # define the error handlers handler404 = 'django.views.defaults.page_not_found' diff -r 3156760b4d26 -r 4cc66ab098e8 scripts/build.sh --- a/scripts/build.sh Mon May 25 23:42:15 2009 +0200 +++ b/scripts/build.sh Tue May 26 02:37:39 2009 +0200 @@ -10,8 +10,8 @@ DEFAULT_APP_BUILD=../build DEFAULT_APP_FOLDER="../app" -DEFAULT_APP_FILES="app.yaml cron.yaml index.yaml main.py settings.py urls.py" -DEFAULT_APP_DIRS="soc ghop gsoc feedparser python25src reflistprop jquery ranklist json" +DEFAULT_APP_FILES="app.yaml cron.yaml index.yaml main.py settings.py shell.py urls.py gae_django.py" +DEFAULT_APP_DIRS="soc ghop gsoc feedparser python25src reflistprop jquery ranklist shell json htmlsanitizer" DEFAULT_ZIP_FILES="tiny_mce.zip" APP_BUILD=${APP_BUILD:-"${DEFAULT_APP_BUILD}"} @@ -20,6 +20,17 @@ APP_DIRS=${APP_DIRS:-"${DEFAULT_APP_DIRS}"} ZIP_FILES=${ZIP_FILES:-"${DEFAULT_ZIP_FILES}"} + +if [ "$1" != "--skip-pylint" ]; then + cd pylint + bash do_pylint.sh --silent + if [ "$?" != "1" ] ; then + echo ' Build failed. Build script encountered pylint errors.' + exit 1 + fi + cd .. 
+fi + if [ -e $APP_FOLDER ] ; then cd $APP_FOLDER else diff -r 3156760b4d26 -r 4cc66ab098e8 scripts/pylint/do_pylint.sh --- a/scripts/pylint/do_pylint.sh Mon May 25 23:42:15 2009 +0200 +++ b/scripts/pylint/do_pylint.sh Tue May 26 02:37:39 2009 +0200 @@ -36,7 +36,6 @@ PROJ_DIR=$(cd "$PROJ_DIR"; pwd) APP_DIR="${PROJ_DIR}/app" -# Note: We will add ghop and gsoc modules once there something in there CHECK_MODULES="soc reflistprop settings.py urls.py main.py" PYLINTRC=$(dirname "$0")/pylintrc @@ -60,3 +59,4 @@ done pylint $SILENT_ARGS $ARGS $CHECK_MODULES_PATHS +exit $? \ No newline at end of file diff -r 3156760b4d26 -r 4cc66ab098e8 scripts/stats.py --- a/scripts/stats.py Mon May 25 23:42:15 2009 +0200 +++ b/scripts/stats.py Tue May 26 02:37:39 2009 +0200 @@ -277,6 +277,21 @@ job_logic.updateOrCreateFromFields(job_fields) +def startUniqueUserIdConversion(): + """Creates the job that is responsible for adding unique user ids. + """ + + from soc.logic.models.job import logic as job_logic + from soc.logic.models.priority_group import logic as priority_logic + + priority_group = priority_logic.getGroup(priority_logic.CONVERT) + job_fields = { + 'priority_group': priority_group, + 'task_name': 'setupUniqueUserIdAdder'} + + job_logic.updateOrCreateFromFields(job_fields) + + def reviveJobs(amount): """Sets jobs that are stuck in 'aborted' to waiting. @@ -357,6 +372,87 @@ cPickle.dump(target, f) +def acceptedStudentsCSVExport(csv_filename, program_key_name): + """Exports all accepted Students for particular program into CSV file. 
+ """ + # TODO(Pawel.Solyga): Add additional Program parameter to this method + # so we export students from different programs + # TODO(Pawel.SOlyga): Make it universal so it works with both GHOP + # and GSoC programs + + from soc.models.student_project import StudentProject + from soc.models.student import Student + from soc.models.organization import Organization + + getStudentProjects = getEntities(StudentProject) + student_projects = getStudentProjects() + student_projects_amount = len(student_projects) + print "Fetched %d Student Projects." % student_projects_amount + print "Fetching Student entities from Student Projects." + accepted_students = {} + student_organization = {} + counter = 0 + for sp_key in student_projects.keys(): + key = student_projects[sp_key].student.key().name() + accepted_students[key] = student_projects[sp_key].student + org_name = student_projects[sp_key].scope.name + student_organization[key] = org_name + counter += 1 + print str(counter) + '/' + str(student_projects_amount) + ' ' + key + ' (' + org_name + ')' + print "All Student entities fetched." + + students_key_order = ['link_id', 'given_name', 'surname', + 'name_on_documents', 'email', 'res_street', 'res_city', 'res_state', + 'res_country', 'res_postalcode', 'phone', 'ship_street', 'ship_city', + 'ship_state', 'ship_country', 'ship_postalcode', 'birth_date', + 'tshirt_size', 'tshirt_style', 'name', 'school_name', 'school_country', + 'major', 'degree'] + + print "Preparing Students data for export." + students_data = [accepted_students[i].toDict(students_key_order) for i in accepted_students.keys()] + + print "Adding organization name to Students data." + for student in students_data: + student['organization'] = student_organization[program_key_name + '/' + student['link_id']] + + students_key_order.append('organization') + + saveDataToCSV(csv_filename, students_data, students_key_order) + print "Accepted Students exported to %s file." 
% csv_filename + + +def saveDataToCSV(csv_filename, data, key_order): + """Saves data in order into CSV file. + + This is a helper function used with acceptedStudentsCSVExport(). + """ + + import csv + import StringIO + + from soc.logic import dicts + + file_handler = StringIO.StringIO() + + writer = csv.DictWriter(file_handler, key_order, dialect='excel') + writer.writerow(dicts.identity(key_order)) + + # encode the data to UTF-8 to ensure compatibility + for row_dict in data: + for key in row_dict.keys(): + value = row_dict[key] + if isinstance(value, basestring): + row_dict[key] = value.encode("utf-8") + else: + row_dict[key] = str(value) + writer.writerow(row_dict) + + csv_data = file_handler.getvalue() + csv_file = open(csv_filename, 'w') + csv_file.write(csv_data) + csv_file.close() + + def main(args): """Main routine. """ @@ -411,6 +507,8 @@ 'startSpam': startSpam, 'reviveJobs': reviveJobs, 'deidleJobs': deidleJobs, + 'acceptedStudentsCSVExport': acceptedStudentsCSVExport, + 'startUniqueUserIdConversion': startUniqueUserIdConversion, } interactive.remote(args, context) diff -r 3156760b4d26 -r 4cc66ab098e8 tests/run.py --- a/tests/run.py Mon May 25 23:42:15 2009 +0200 +++ b/tests/run.py Tue May 26 02:37:39 2009 +0200 @@ -9,6 +9,7 @@ os.path.join(appengine_location, 'lib', 'django'), os.path.join(appengine_location, 'lib', 'webob'), os.path.join(appengine_location, 'lib', 'yaml', 'lib'), + os.path.join(appengine_location, 'lib', 'antlr3'), appengine_location, os.path.join(HERE, 'app'), os.path.join(HERE, 'thirdparty', 'coverage'), @@ -32,7 +33,9 @@ def afterTest(self, test): from google.appengine.api import apiproxy_stub_map datastore = apiproxy_stub_map.apiproxy.GetStub('datastore') - datastore.Clear() + # clear datastore iff one is available + if datastore is not None: + datastore.Clear() def main(): diff -r 3156760b4d26 -r 4cc66ab098e8 tests/test_functional.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/test_functional.py Tue May 26 02:37:39 
2009 +0200 @@ -0,0 +1,100 @@ +#!/usr/bin/python2.5 +# +# Copyright 2009 the Melange authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +__authors__ = [ + '"Matthew Wilkes" ', + ] + + +from gaeftest.test import FunctionalTestCase + +from zope.testbrowser import browser + +import os.path + + +class MelangeFunctionalTestCase(FunctionalTestCase): + """A base class for all functional tests in Melange. + + Tests MUST NOT be defined here, but the superclass requires a path + attribute that points to the app.yaml. Utility functions MAY be + declared here to be shared by all functional tests, but any + overridden unittest methods MUST call the superclass version. + """ + + path = os.path.abspath(__file__+"/../../app/app.yaml") + + +class TestBranding(MelangeFunctionalTestCase): + """Tests that ensure Melange properly displays attribution. + + Other notices, as required by the project and/or law, are tested + here as well. + """ + + def test_attribution(self): + """Ensure that the front page asserts that it is a Melange app. + """ + + tb = browser.Browser() + tb.open("http://127.0.0.1:8080/site/show/site") + + self.assertTrue("Powered by Melange" in tb.contents) + + +class TestLogin(MelangeFunctionalTestCase): + """Tests that check the login system is functioning correctly. + + Also tests that users go through the correct registration workflow. + """ + + def test_firstLogin(self): + """Ensure that new users are prompted to create a profile. 
+ + Also test that only new users are prompted. + """ + + tb = browser.Browser() + tb.open("http://127.0.0.1:8080") + + tb.getLink("Sign in").click() + self.assertTrue("login" in tb.url) + + # fill in dev_appserver login form + tb.getForm().getControl("Email").value = "newuser@example.com" + tb.getForm().getControl("Login").click() + + self.assertTrue(tb.url.endswith("/show/site")) + self.assertTrue('Please create ' + 'User Profile in order to view this page' in tb.contents) + + tb.getLink("User Profile").click() + + # fill in the user profile + cp = tb.getForm(action="create_profile") + cp.getControl(name="link_id").value = "exampleuser" + cp.getControl(name="name").value = "Example user" + cp.getControl("Save").click() + + # if all is well, we go to the edit page + self.assertTrue("edit_profile" in tb.url) + + tb.open("http://127.0.0.1:8080") + + # call to action no longer on front page + self.assertFalse('Please create ' + 'User Profile in order to view this page' in tb.contents) \ No newline at end of file
    foo Please select the appropriate action:
    - + {% if cancel_redirect %} + + {% endif %} + value="Cancel"/> + {% else %} + + {% endif %}
    Please select the appropriate action:
    - + {% if cancel_redirect %} + + {% endif %} + value="Cancel"/> + {% else %} + + {% endif %}
    Please select the appropriate action:
    - + {% if cancel_redirect %} + + {% endif %} + value="Cancel"/> + {% else %} + + {% endif %}
    Please select the appropriate action:
    - + {% if cancel_redirect %} + + {% endif %} + value="Cancel"/> + {% else %} + + {% endif %}
    - {% if edit_cancel_redirect %} - - {% else %} - - {% endif %} + {% if cancel_redirect %} + + {% endif %} + value="Cancel"/> + {% else %} + + {% endif %}
    {{ list.item.subject }}
    {{ list.item.created_on }}
    {{ list.item.created_on|date:"jS F Y H:i" }}
    Please select the appropriate action:
    - + {% if cancel_redirect %} + + {% endif %} + value="Cancel"/> + {% else %} + + {% endif %}
    Please select the appropriate action:
    - + {% if cancel_redirect %} + + {% endif %} + value="Cancel"/> + {% else %} + + {% endif %}