app/django/utils/stopwords.py
author Pawel Solyga <Pawel.Solyga@gmail.com>
Tue, 20 Jan 2009 21:00:55 +0000
changeset 848 07fd6a603c24
parent 54 03e267d67478
permissions -rw-r--r--
Disable R0801 messages in pylintrc. R0801 messages indicate that a set of similar lines has been detected among multiple files. This usually means that the code should be refactored to avoid this duplication, but in our case it's useless since it shows a lot of import code or author lines. Patch by: Pawel Solyga. Review by: to-be-reviewed.

# Performance note: I benchmarked this code using a set instead of
# a list for the stopwords and was surprised to find that the list
# performed /better/ than the set - maybe because it's only a small
# list.

# Common English words that strip_stopwords() drops from a sentence.
# All entries are lowercase because callers compare against word.lower().
# Deliberately a plain list (not a set); membership is tested with `in`.
stopwords = [
    'i', 'a', 'an', 'are', 'as',
    'at', 'be', 'by', 'for', 'from',
    'how', 'in', 'is', 'it', 'of',
    'on', 'or', 'that', 'the', 'this',
    'to', 'was', 'what', 'when', 'where',
]

def strip_stopwords(sentence):
    """Return ``sentence`` without stopwords and with whitespace normalized.

    The sentence is split on runs of whitespace; every word whose lowercased
    form appears in the module-level ``stopwords`` list is discarded. The
    remaining words keep their original case and order and are joined with
    single spaces into a unicode string.
    """
    kept = [token for token in sentence.split()
            if token.lower() not in stopwords]
    return u' '.join(kept)