app/django/utils/stopwords.py
author Mario Ferraro <fadinlight@gmail.com>
Sun, 15 Nov 2009 22:12:20 +0100
changeset 3093 d1be59b6b627
parent 54 03e267d67478
permissions -rw-r--r--
GMaps related JS changed to use new google namespace. Google is going to change permanently in the future the way to load its services, so better stay safe. Also this commit shows uses of the new melange.js module. Fixes Issue 634.

# Performance note: I benchmarked this code using a set instead of
# a list for the stopwords and was surprised to find that the list
# performed /better/ than the set - maybe because it's only a small
# list.

stopwords = '''
i
a
an
are
as
at
be
by
for
from
how
in
is
it
of
on
or
that
the
this
to
was
what
when
where
'''.split()

def strip_stopwords(sentence):
    "Removes stopwords - also normalizes whitespace"
    words = sentence.split()
    sentence = []
    for word in words:
        if word.lower() not in stopwords:
            sentence.append(word)
    return u' '.join(sentence)