app/django/utils/text.py
changeset 54 03e267d67478
child 323 ff1a9aa48cfd
equal deleted inserted replaced
53:57b4279d8c4e 54:03e267d67478
       
     1 import re
       
     2 from django.conf import settings
       
     3 from django.utils.encoding import force_unicode
       
     4 from django.utils.functional import allow_lazy
       
     5 from django.utils.translation import ugettext_lazy
       
     6 
       
     7 # Capitalizes the first letter of a string.
       
     8 capfirst = lambda x: x and force_unicode(x)[0].upper() + force_unicode(x)[1:]
       
     9 capfirst = allow_lazy(capfirst, unicode)
       
    10 
       
    11 def wrap(text, width):
       
    12     """
       
    13     A word-wrap function that preserves existing line breaks and most spaces in
       
    14     the text. Expects that existing line breaks are posix newlines.
       
    15     """
       
    16     text = force_unicode(text)
       
    17     def _generator():
       
    18         it = iter(text.split(' '))
       
    19         word = it.next()
       
    20         yield word
       
    21         pos = len(word) - word.rfind('\n') - 1
       
    22         for word in it:
       
    23             if "\n" in word:
       
    24                 lines = word.split('\n')
       
    25             else:
       
    26                 lines = (word,)
       
    27             pos += len(lines[0]) + 1
       
    28             if pos > width:
       
    29                 yield '\n'
       
    30                 pos = len(lines[-1])
       
    31             else:
       
    32                 yield ' '
       
    33                 if len(lines) > 1:
       
    34                     pos = len(lines[-1])
       
    35             yield word
       
    36     return u''.join(_generator())
       
    37 wrap = allow_lazy(wrap, unicode)
       
    38 
       
    39 def truncate_words(s, num):
       
    40     "Truncates a string after a certain number of words."
       
    41     s = force_unicode(s)
       
    42     length = int(num)
       
    43     words = s.split()
       
    44     if len(words) > length:
       
    45         words = words[:length]
       
    46         if not words[-1].endswith('...'):
       
    47             words.append('...')
       
    48     return u' '.join(words)
       
    49 truncate_words = allow_lazy(truncate_words, unicode)
       
    50 
       
    51 def truncate_html_words(s, num):
       
    52     """
       
    53     Truncates html to a certain number of words (not counting tags and
       
    54     comments). Closes opened tags if they were correctly closed in the given
       
    55     html.
       
    56     """
       
    57     s = force_unicode(s)
       
    58     length = int(num)
       
    59     if length <= 0:
       
    60         return u''
       
    61     html4_singlets = ('br', 'col', 'link', 'base', 'img', 'param', 'area', 'hr', 'input')
       
    62     # Set up regular expressions
       
    63     re_words = re.compile(r'&.*?;|<.*?>|(\w[\w-]*)', re.U)
       
    64     re_tag = re.compile(r'<(/)?([^ ]+?)(?: (/)| .*?)?>')
       
    65     # Count non-HTML words and keep note of open tags
       
    66     pos = 0
       
    67     ellipsis_pos = 0
       
    68     words = 0
       
    69     open_tags = []
       
    70     while words <= length:
       
    71         m = re_words.search(s, pos)
       
    72         if not m:
       
    73             # Checked through whole string
       
    74             break
       
    75         pos = m.end(0)
       
    76         if m.group(1):
       
    77             # It's an actual non-HTML word
       
    78             words += 1
       
    79             if words == length:
       
    80                 ellipsis_pos = pos
       
    81             continue
       
    82         # Check for tag
       
    83         tag = re_tag.match(m.group(0))
       
    84         if not tag or ellipsis_pos:
       
    85             # Don't worry about non tags or tags after our truncate point
       
    86             continue
       
    87         closing_tag, tagname, self_closing = tag.groups()
       
    88         tagname = tagname.lower()  # Element names are always case-insensitive
       
    89         if self_closing or tagname in html4_singlets:
       
    90             pass
       
    91         elif closing_tag:
       
    92             # Check for match in open tags list
       
    93             try:
       
    94                 i = open_tags.index(tagname)
       
    95             except ValueError:
       
    96                 pass
       
    97             else:
       
    98                 # SGML: An end tag closes, back to the matching start tag, all unclosed intervening start tags with omitted end tags
       
    99                 open_tags = open_tags[i+1:]
       
   100         else:
       
   101             # Add it to the start of the open tags list
       
   102             open_tags.insert(0, tagname)
       
   103     if words <= length:
       
   104         # Don't try to close tags if we don't need to truncate
       
   105         return s
       
   106     out = s[:ellipsis_pos] + ' ...'
       
   107     # Close any tags still open
       
   108     for tag in open_tags:
       
   109         out += '</%s>' % tag
       
   110     # Return string
       
   111     return out
       
   112 truncate_html_words = allow_lazy(truncate_html_words, unicode)
       
   113 
       
   114 def get_valid_filename(s):
       
   115     """
       
   116     Returns the given string converted to a string that can be used for a clean
       
   117     filename. Specifically, leading and trailing spaces are removed; other
       
   118     spaces are converted to underscores; and all non-filename-safe characters
       
   119     are removed.
       
   120     >>> get_valid_filename("john's portrait in 2004.jpg")
       
   121     'johns_portrait_in_2004.jpg'
       
   122     """
       
   123     s = force_unicode(s).strip().replace(' ', '_')
       
   124     return re.sub(r'[^-A-Za-z0-9_.]', '', s)
       
   125 get_valid_filename = allow_lazy(get_valid_filename, unicode)
       
   126 
       
   127 def get_text_list(list_, last_word=ugettext_lazy(u'or')):
       
   128     """
       
   129     >>> get_text_list(['a', 'b', 'c', 'd'])
       
   130     'a, b, c or d'
       
   131     >>> get_text_list(['a', 'b', 'c'], 'and')
       
   132     'a, b and c'
       
   133     >>> get_text_list(['a', 'b'], 'and')
       
   134     'a and b'
       
   135     >>> get_text_list(['a'])
       
   136     'a'
       
   137     >>> get_text_list([])
       
   138     ''
       
   139     """
       
   140     if len(list_) == 0: return u''
       
   141     if len(list_) == 1: return force_unicode(list_[0])
       
   142     return u'%s %s %s' % (', '.join([force_unicode(i) for i in list_][:-1]), force_unicode(last_word), force_unicode(list_[-1]))
       
   143 get_text_list = allow_lazy(get_text_list, unicode)
       
   144 
       
   145 def normalize_newlines(text):
       
   146     return force_unicode(re.sub(r'\r\n|\r|\n', '\n', text))
       
   147 normalize_newlines = allow_lazy(normalize_newlines, unicode)
       
   148 
       
   149 def recapitalize(text):
       
   150     "Recapitalizes text, placing caps after end-of-sentence punctuation."
       
   151     text = force_unicode(text).lower()
       
   152     capsRE = re.compile(r'(?:^|(?<=[\.\?\!] ))([a-z])')
       
   153     text = capsRE.sub(lambda x: x.group(1).upper(), text)
       
   154     return text
       
   155 recapitalize = allow_lazy(recapitalize)
       
   156 
       
   157 def phone2numeric(phone):
       
   158     "Converts a phone number with letters into its numeric equivalent."
       
   159     letters = re.compile(r'[A-PR-Y]', re.I)
       
   160     char2number = lambda m: {'a': '2', 'c': '2', 'b': '2', 'e': '3',
       
   161          'd': '3', 'g': '4', 'f': '3', 'i': '4', 'h': '4', 'k': '5',
       
   162          'j': '5', 'm': '6', 'l': '5', 'o': '6', 'n': '6', 'p': '7',
       
   163          's': '7', 'r': '7', 'u': '8', 't': '8', 'w': '9', 'v': '8',
       
   164          'y': '9', 'x': '9'}.get(m.group(0).lower())
       
   165     return letters.sub(char2number, phone)
       
   166 phone2numeric = allow_lazy(phone2numeric)
       
   167 
       
   168 # From http://www.xhaus.com/alan/python/httpcomp.html#gzip
       
   169 # Used with permission.
       
   170 def compress_string(s):
       
   171     import cStringIO, gzip
       
   172     zbuf = cStringIO.StringIO()
       
   173     zfile = gzip.GzipFile(mode='wb', compresslevel=6, fileobj=zbuf)
       
   174     zfile.write(s)
       
   175     zfile.close()
       
   176     return zbuf.getvalue()
       
   177 
       
   178 ustring_re = re.compile(u"([\u0080-\uffff])")
       
   179 
       
   180 def javascript_quote(s, quote_double_quotes=False):
       
   181 
       
   182     def fix(match):
       
   183         return r"\u%04x" % ord(match.group(1))
       
   184 
       
   185     if type(s) == str:
       
   186         s = s.decode('utf-8')
       
   187     elif type(s) != unicode:
       
   188         raise TypeError, s
       
   189     s = s.replace('\\', '\\\\')
       
   190     s = s.replace('\r', '\\r')
       
   191     s = s.replace('\n', '\\n')
       
   192     s = s.replace('\t', '\\t')
       
   193     s = s.replace("'", "\\'")
       
   194     if quote_double_quotes:
       
   195         s = s.replace('"', '&quot;')
       
   196     return str(ustring_re.sub(fix, s))
       
   197 javascript_quote = allow_lazy(javascript_quote, unicode)
       
   198 
       
   199 smart_split_re = re.compile('("(?:[^"\\\\]*(?:\\\\.[^"\\\\]*)*)"|\'(?:[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*)\'|[^\\s]+)')
       
   200 def smart_split(text):
       
   201     """
       
   202     Generator that splits a string by spaces, leaving quoted phrases together.
       
   203     Supports both single and double quotes, and supports escaping quotes with
       
   204     backslashes. In the output, strings will keep their initial and trailing
       
   205     quote marks.
       
   206 
       
   207     >>> list(smart_split('This is "a person\'s" test.'))
       
   208     ['This', 'is', '"a person\'s"', 'test.']
       
   209     """
       
   210     text = force_unicode(text)
       
   211     for bit in smart_split_re.finditer(text):
       
   212         bit = bit.group(0)
       
   213         if bit[0] == '"' and bit[-1] == '"':
       
   214             yield '"' + bit[1:-1].replace('\\"', '"').replace('\\\\', '\\') + '"'
       
   215         elif bit[0] == "'" and bit[-1] == "'":
       
   216             yield "'" + bit[1:-1].replace("\\'", "'").replace("\\\\", "\\") + "'"
       
   217         else:
       
   218             yield bit
       
   219 smart_split = allow_lazy(smart_split, unicode)
       
   220