app/django/utils/stopwords.py
author Matthew Wilkes <matthew@matthewwilkes.co.uk>
Fri, 15 May 2009 15:29:41 +0200
changeset 2314 0a0e603215d7
parent 54 03e267d67478
permissions -rw-r--r--
Include required antl3 library and check if datastore is available The datastore is checked for availability before requesting it to be cleared. This is because gaeftest uses its own method for ensuring no leakage of data by providing a temporary file as the backend. Reviewed by: Sverre Rabbelier

# Performance note: I benchmarked this code using a set instead of
# a list for the stopwords and was surprised to find that the list
# performed /better/ than the set - maybe because it's only a small
# list.

stopwords = '''
i
a
an
are
as
at
be
by
for
from
how
in
is
it
of
on
or
that
the
this
to
was
what
when
where
'''.split()

def strip_stopwords(sentence):
    "Removes stopwords - also normalizes whitespace"
    words = sentence.split()
    sentence = []
    for word in words:
        if word.lower() not in stopwords:
            sentence.append(word)
    return u' '.join(sentence)