app/django/utils/stopwords.py
author Pawel Solyga <Pawel.Solyga@gmail.com>
Wed, 15 Oct 2008 14:06:33 +0000
changeset 338 0d78f41dde9b
parent 54 03e267d67478
permissions -rw-r--r--
Show "Created by" read-only field in Document Edit view. Update size of TinyMCE widget in Document Edit/Create views. Change user property name to founder in Document model and update files according to this change (now founder is used in Group and Document models). Remove not used variables and imports in views/site/docs/edit.py. Refactor EditForm and CreateForm in views/site/docs/edit.py so that EditForm inherits from CreateForm and just extends it. Patch by: Pawel Solyga Review by: to-be-reviewed

# Performance note: I benchmarked this code using a set instead of
# a list for the stopwords and was surprised to find that the list
# performed /better/ than the set - maybe because it's only a small
# list.

stopwords = '''
i
a
an
are
as
at
be
by
for
from
how
in
is
it
of
on
or
that
the
this
to
was
what
when
where
'''.split()

def strip_stopwords(sentence):
    "Removes stopwords - also normalizes whitespace"
    words = sentence.split()
    sentence = []
    for word in words:
        if word.lower() not in stopwords:
            sentence.append(word)
    return u' '.join(sentence)