app/django/utils/stopwords.py
changeset 54 03e267d67478
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/app/django/utils/stopwords.py	Fri Jul 18 18:22:23 2008 +0000
@@ -0,0 +1,42 @@
+# Performance note: I benchmarked this code using a set instead of
+# a list for the stopwords and was surprised to find that the list
+# performed /better/ than the set - maybe because it's only a small
+# list.
+
+stopwords = [
+    'i',
+    'a',
+    'an',
+    'are',
+    'as',
+    'at',
+    'be',
+    'by',
+    'for',
+    'from',
+    'how',
+    'in',
+    'is',
+    'it',
+    'of',
+    'on',
+    'or',
+    'that',
+    'the',
+    'this',
+    'to',
+    'was',
+    'what',
+    'when',
+    'where',
+]
+
+def strip_stopwords(sentence):
+    "Drops stopwords (matched case-insensitively) and collapses whitespace"
+    # split() with no argument discards leading/trailing/repeated whitespace
+    kept = [
+        token for token in sentence.split()
+        if token.lower() not in stopwords
+    ]
+    return u' '.join(kept)
+