app/django/utils/text.py
author Todd Larsen <tlarsen@google.com>
Mon, 29 Sep 2008 15:46:42 +0000
changeset 208 e076aee6e90f
parent 54 03e267d67478
child 323 ff1a9aa48cfd
permissions -rw-r--r--
Take advantage of the Model inheritance provided by polymodel.PolyModel to have Club, School, Sponsor, and Organization actually inherit from the Group Model class, rather than being composed via ReferenceProperties. Patch by: Todd Larsen Review by: Pawel Solyga, Sverre Rabbelier, Augie Fackler Review URL: http://codereviews.googleopensourceprograms.com/606
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
54
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
     1
import re
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
     2
from django.conf import settings
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
     3
from django.utils.encoding import force_unicode
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
     4
from django.utils.functional import allow_lazy
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
     5
from django.utils.translation import ugettext_lazy
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
     6
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
     7
# Capitalizes the first letter of a string.
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
     8
capfirst = lambda x: x and force_unicode(x)[0].upper() + force_unicode(x)[1:]
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
     9
capfirst = allow_lazy(capfirst, unicode)
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    10
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    11
def wrap(text, width):
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    12
    """
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    13
    A word-wrap function that preserves existing line breaks and most spaces in
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    14
    the text. Expects that existing line breaks are posix newlines.
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    15
    """
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    16
    text = force_unicode(text)
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    17
    def _generator():
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    18
        it = iter(text.split(' '))
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    19
        word = it.next()
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    20
        yield word
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    21
        pos = len(word) - word.rfind('\n') - 1
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    22
        for word in it:
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    23
            if "\n" in word:
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    24
                lines = word.split('\n')
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    25
            else:
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    26
                lines = (word,)
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    27
            pos += len(lines[0]) + 1
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    28
            if pos > width:
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    29
                yield '\n'
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    30
                pos = len(lines[-1])
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    31
            else:
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    32
                yield ' '
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    33
                if len(lines) > 1:
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    34
                    pos = len(lines[-1])
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    35
            yield word
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    36
    return u''.join(_generator())
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    37
wrap = allow_lazy(wrap, unicode)
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    38
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    39
def truncate_words(s, num):
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    40
    "Truncates a string after a certain number of words."
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    41
    s = force_unicode(s)
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    42
    length = int(num)
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    43
    words = s.split()
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    44
    if len(words) > length:
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    45
        words = words[:length]
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    46
        if not words[-1].endswith('...'):
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    47
            words.append('...')
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    48
    return u' '.join(words)
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    49
truncate_words = allow_lazy(truncate_words, unicode)
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    50
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    51
def truncate_html_words(s, num):
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    52
    """
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    53
    Truncates html to a certain number of words (not counting tags and
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    54
    comments). Closes opened tags if they were correctly closed in the given
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    55
    html.
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    56
    """
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    57
    s = force_unicode(s)
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    58
    length = int(num)
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    59
    if length <= 0:
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    60
        return u''
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    61
    html4_singlets = ('br', 'col', 'link', 'base', 'img', 'param', 'area', 'hr', 'input')
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    62
    # Set up regular expressions
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    63
    re_words = re.compile(r'&.*?;|<.*?>|(\w[\w-]*)', re.U)
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    64
    re_tag = re.compile(r'<(/)?([^ ]+?)(?: (/)| .*?)?>')
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    65
    # Count non-HTML words and keep note of open tags
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    66
    pos = 0
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    67
    ellipsis_pos = 0
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    68
    words = 0
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    69
    open_tags = []
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    70
    while words <= length:
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    71
        m = re_words.search(s, pos)
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    72
        if not m:
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    73
            # Checked through whole string
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    74
            break
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    75
        pos = m.end(0)
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    76
        if m.group(1):
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    77
            # It's an actual non-HTML word
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    78
            words += 1
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    79
            if words == length:
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    80
                ellipsis_pos = pos
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    81
            continue
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    82
        # Check for tag
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    83
        tag = re_tag.match(m.group(0))
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    84
        if not tag or ellipsis_pos:
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    85
            # Don't worry about non tags or tags after our truncate point
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    86
            continue
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    87
        closing_tag, tagname, self_closing = tag.groups()
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    88
        tagname = tagname.lower()  # Element names are always case-insensitive
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    89
        if self_closing or tagname in html4_singlets:
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    90
            pass
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    91
        elif closing_tag:
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    92
            # Check for match in open tags list
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    93
            try:
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    94
                i = open_tags.index(tagname)
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    95
            except ValueError:
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    96
                pass
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    97
            else:
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    98
                # SGML: An end tag closes, back to the matching start tag, all unclosed intervening start tags with omitted end tags
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
    99
                open_tags = open_tags[i+1:]
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   100
        else:
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   101
            # Add it to the start of the open tags list
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   102
            open_tags.insert(0, tagname)
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   103
    if words <= length:
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   104
        # Don't try to close tags if we don't need to truncate
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   105
        return s
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   106
    out = s[:ellipsis_pos] + ' ...'
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   107
    # Close any tags still open
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   108
    for tag in open_tags:
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   109
        out += '</%s>' % tag
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   110
    # Return string
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   111
    return out
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   112
truncate_html_words = allow_lazy(truncate_html_words, unicode)
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   113
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   114
def get_valid_filename(s):
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   115
    """
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   116
    Returns the given string converted to a string that can be used for a clean
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   117
    filename. Specifically, leading and trailing spaces are removed; other
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   118
    spaces are converted to underscores; and all non-filename-safe characters
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   119
    are removed.
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   120
    >>> get_valid_filename("john's portrait in 2004.jpg")
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   121
    'johns_portrait_in_2004.jpg'
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   122
    """
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   123
    s = force_unicode(s).strip().replace(' ', '_')
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   124
    return re.sub(r'[^-A-Za-z0-9_.]', '', s)
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   125
get_valid_filename = allow_lazy(get_valid_filename, unicode)
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   126
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   127
def get_text_list(list_, last_word=ugettext_lazy(u'or')):
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   128
    """
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   129
    >>> get_text_list(['a', 'b', 'c', 'd'])
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   130
    'a, b, c or d'
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   131
    >>> get_text_list(['a', 'b', 'c'], 'and')
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   132
    'a, b and c'
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   133
    >>> get_text_list(['a', 'b'], 'and')
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   134
    'a and b'
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   135
    >>> get_text_list(['a'])
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   136
    'a'
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   137
    >>> get_text_list([])
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   138
    ''
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   139
    """
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   140
    if len(list_) == 0: return u''
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   141
    if len(list_) == 1: return force_unicode(list_[0])
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   142
    return u'%s %s %s' % (', '.join([force_unicode(i) for i in list_][:-1]), force_unicode(last_word), force_unicode(list_[-1]))
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   143
get_text_list = allow_lazy(get_text_list, unicode)
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   144
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   145
def normalize_newlines(text):
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   146
    return force_unicode(re.sub(r'\r\n|\r|\n', '\n', text))
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   147
normalize_newlines = allow_lazy(normalize_newlines, unicode)
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   148
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   149
def recapitalize(text):
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   150
    "Recapitalizes text, placing caps after end-of-sentence punctuation."
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   151
    text = force_unicode(text).lower()
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   152
    capsRE = re.compile(r'(?:^|(?<=[\.\?\!] ))([a-z])')
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   153
    text = capsRE.sub(lambda x: x.group(1).upper(), text)
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   154
    return text
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   155
recapitalize = allow_lazy(recapitalize)
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   156
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   157
def phone2numeric(phone):
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   158
    "Converts a phone number with letters into its numeric equivalent."
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   159
    letters = re.compile(r'[A-PR-Y]', re.I)
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   160
    char2number = lambda m: {'a': '2', 'c': '2', 'b': '2', 'e': '3',
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   161
         'd': '3', 'g': '4', 'f': '3', 'i': '4', 'h': '4', 'k': '5',
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   162
         'j': '5', 'm': '6', 'l': '5', 'o': '6', 'n': '6', 'p': '7',
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   163
         's': '7', 'r': '7', 'u': '8', 't': '8', 'w': '9', 'v': '8',
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   164
         'y': '9', 'x': '9'}.get(m.group(0).lower())
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   165
    return letters.sub(char2number, phone)
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   166
phone2numeric = allow_lazy(phone2numeric)
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   167
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   168
# From http://www.xhaus.com/alan/python/httpcomp.html#gzip
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   169
# Used with permission.
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   170
def compress_string(s):
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   171
    import cStringIO, gzip
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   172
    zbuf = cStringIO.StringIO()
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   173
    zfile = gzip.GzipFile(mode='wb', compresslevel=6, fileobj=zbuf)
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   174
    zfile.write(s)
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   175
    zfile.close()
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   176
    return zbuf.getvalue()
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   177
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   178
ustring_re = re.compile(u"([\u0080-\uffff])")
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   179
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   180
def javascript_quote(s, quote_double_quotes=False):
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   181
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   182
    def fix(match):
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   183
        return r"\u%04x" % ord(match.group(1))
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   184
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   185
    if type(s) == str:
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   186
        s = s.decode('utf-8')
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   187
    elif type(s) != unicode:
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   188
        raise TypeError, s
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   189
    s = s.replace('\\', '\\\\')
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   190
    s = s.replace('\r', '\\r')
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   191
    s = s.replace('\n', '\\n')
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   192
    s = s.replace('\t', '\\t')
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   193
    s = s.replace("'", "\\'")
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   194
    if quote_double_quotes:
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   195
        s = s.replace('"', '&quot;')
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   196
    return str(ustring_re.sub(fix, s))
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   197
javascript_quote = allow_lazy(javascript_quote, unicode)
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   198
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   199
smart_split_re = re.compile('("(?:[^"\\\\]*(?:\\\\.[^"\\\\]*)*)"|\'(?:[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*)\'|[^\\s]+)')
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   200
def smart_split(text):
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   201
    """
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   202
    Generator that splits a string by spaces, leaving quoted phrases together.
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   203
    Supports both single and double quotes, and supports escaping quotes with
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   204
    backslashes. In the output, strings will keep their initial and trailing
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   205
    quote marks.
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   206
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   207
    >>> list(smart_split('This is "a person\'s" test.'))
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   208
    ['This', 'is', '"a person\'s"', 'test.']
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   209
    """
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   210
    text = force_unicode(text)
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   211
    for bit in smart_split_re.finditer(text):
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   212
        bit = bit.group(0)
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   213
        if bit[0] == '"' and bit[-1] == '"':
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   214
            yield '"' + bit[1:-1].replace('\\"', '"').replace('\\\\', '\\') + '"'
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   215
        elif bit[0] == "'" and bit[-1] == "'":
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   216
            yield "'" + bit[1:-1].replace("\\'", "'").replace("\\\\", "\\") + "'"
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   217
        else:
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   218
            yield bit
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   219
smart_split = allow_lazy(smart_split, unicode)
03e267d67478 Major reorganization of the soc svn repo, to merge into a single App Engine
Todd Larsen <tlarsen@google.com>
parents:
diff changeset
   220