author | Sverre Rabbelier <srabbelier@gmail.com> |
Sat, 21 Mar 2009 13:07:12 +0000 | |
changeset 1978 | a8810c038500 |
parent 109 | 620f9b141567 |
permissions | -rwxr-xr-x |
109
620f9b141567
Load ../../google_appengine into trunk/thirdparty/google_appengine.
Todd Larsen <tlarsen@google.com>
parents:
diff
changeset
|
1 |
# Performance note: I benchmarked this code using a set instead of |
620f9b141567
Load ../../google_appengine into trunk/thirdparty/google_appengine.
Todd Larsen <tlarsen@google.com>
parents:
diff
changeset
|
2 |
# a list for the stopwords and was surprised to find that the list |
620f9b141567
Load ../../google_appengine into trunk/thirdparty/google_appengine.
Todd Larsen <tlarsen@google.com>
parents:
diff
changeset
|
3 |
# performed /better/ than the set - maybe because it's only a small |
620f9b141567
Load ../../google_appengine into trunk/thirdparty/google_appengine.
Todd Larsen <tlarsen@google.com>
parents:
diff
changeset
|
4 |
# list. |
620f9b141567
Load ../../google_appengine into trunk/thirdparty/google_appengine.
Todd Larsen <tlarsen@google.com>
parents:
diff
changeset
|
5 |
|
620f9b141567
Load ../../google_appengine into trunk/thirdparty/google_appengine.
Todd Larsen <tlarsen@google.com>
parents:
diff
changeset
|
6 |
stopwords = ''' |
620f9b141567
Load ../../google_appengine into trunk/thirdparty/google_appengine.
Todd Larsen <tlarsen@google.com>
parents:
diff
changeset
|
7 |
i |
620f9b141567
Load ../../google_appengine into trunk/thirdparty/google_appengine.
Todd Larsen <tlarsen@google.com>
parents:
diff
changeset
|
8 |
a |
620f9b141567
Load ../../google_appengine into trunk/thirdparty/google_appengine.
Todd Larsen <tlarsen@google.com>
parents:
diff
changeset
|
9 |
an |
620f9b141567
Load ../../google_appengine into trunk/thirdparty/google_appengine.
Todd Larsen <tlarsen@google.com>
parents:
diff
changeset
|
10 |
are |
620f9b141567
Load ../../google_appengine into trunk/thirdparty/google_appengine.
Todd Larsen <tlarsen@google.com>
parents:
diff
changeset
|
11 |
as |
620f9b141567
Load ../../google_appengine into trunk/thirdparty/google_appengine.
Todd Larsen <tlarsen@google.com>
parents:
diff
changeset
|
12 |
at |
620f9b141567
Load ../../google_appengine into trunk/thirdparty/google_appengine.
Todd Larsen <tlarsen@google.com>
parents:
diff
changeset
|
13 |
be |
620f9b141567
Load ../../google_appengine into trunk/thirdparty/google_appengine.
Todd Larsen <tlarsen@google.com>
parents:
diff
changeset
|
14 |
by |
620f9b141567
Load ../../google_appengine into trunk/thirdparty/google_appengine.
Todd Larsen <tlarsen@google.com>
parents:
diff
changeset
|
15 |
for |
620f9b141567
Load ../../google_appengine into trunk/thirdparty/google_appengine.
Todd Larsen <tlarsen@google.com>
parents:
diff
changeset
|
16 |
from |
620f9b141567
Load ../../google_appengine into trunk/thirdparty/google_appengine.
Todd Larsen <tlarsen@google.com>
parents:
diff
changeset
|
17 |
how |
620f9b141567
Load ../../google_appengine into trunk/thirdparty/google_appengine.
Todd Larsen <tlarsen@google.com>
parents:
diff
changeset
|
18 |
in |
620f9b141567
Load ../../google_appengine into trunk/thirdparty/google_appengine.
Todd Larsen <tlarsen@google.com>
parents:
diff
changeset
|
19 |
is |
620f9b141567
Load ../../google_appengine into trunk/thirdparty/google_appengine.
Todd Larsen <tlarsen@google.com>
parents:
diff
changeset
|
20 |
it |
620f9b141567
Load ../../google_appengine into trunk/thirdparty/google_appengine.
Todd Larsen <tlarsen@google.com>
parents:
diff
changeset
|
21 |
of |
620f9b141567
Load ../../google_appengine into trunk/thirdparty/google_appengine.
Todd Larsen <tlarsen@google.com>
parents:
diff
changeset
|
22 |
on |
620f9b141567
Load ../../google_appengine into trunk/thirdparty/google_appengine.
Todd Larsen <tlarsen@google.com>
parents:
diff
changeset
|
23 |
or |
620f9b141567
Load ../../google_appengine into trunk/thirdparty/google_appengine.
Todd Larsen <tlarsen@google.com>
parents:
diff
changeset
|
24 |
that |
620f9b141567
Load ../../google_appengine into trunk/thirdparty/google_appengine.
Todd Larsen <tlarsen@google.com>
parents:
diff
changeset
|
25 |
the |
620f9b141567
Load ../../google_appengine into trunk/thirdparty/google_appengine.
Todd Larsen <tlarsen@google.com>
parents:
diff
changeset
|
26 |
this |
620f9b141567
Load ../../google_appengine into trunk/thirdparty/google_appengine.
Todd Larsen <tlarsen@google.com>
parents:
diff
changeset
|
27 |
to |
620f9b141567
Load ../../google_appengine into trunk/thirdparty/google_appengine.
Todd Larsen <tlarsen@google.com>
parents:
diff
changeset
|
28 |
was |
620f9b141567
Load ../../google_appengine into trunk/thirdparty/google_appengine.
Todd Larsen <tlarsen@google.com>
parents:
diff
changeset
|
29 |
what |
620f9b141567
Load ../../google_appengine into trunk/thirdparty/google_appengine.
Todd Larsen <tlarsen@google.com>
parents:
diff
changeset
|
30 |
when |
620f9b141567
Load ../../google_appengine into trunk/thirdparty/google_appengine.
Todd Larsen <tlarsen@google.com>
parents:
diff
changeset
|
31 |
where |
620f9b141567
Load ../../google_appengine into trunk/thirdparty/google_appengine.
Todd Larsen <tlarsen@google.com>
parents:
diff
changeset
|
32 |
'''.split() |
620f9b141567
Load ../../google_appengine into trunk/thirdparty/google_appengine.
Todd Larsen <tlarsen@google.com>
parents:
diff
changeset
|
33 |
|
620f9b141567
Load ../../google_appengine into trunk/thirdparty/google_appengine.
Todd Larsen <tlarsen@google.com>
parents:
diff
changeset
|
34 |
def strip_stopwords(sentence): |
620f9b141567
Load ../../google_appengine into trunk/thirdparty/google_appengine.
Todd Larsen <tlarsen@google.com>
parents:
diff
changeset
|
35 |
"Removes stopwords - also normalizes whitespace" |
620f9b141567
Load ../../google_appengine into trunk/thirdparty/google_appengine.
Todd Larsen <tlarsen@google.com>
parents:
diff
changeset
|
36 |
words = sentence.split() |
620f9b141567
Load ../../google_appengine into trunk/thirdparty/google_appengine.
Todd Larsen <tlarsen@google.com>
parents:
diff
changeset
|
37 |
sentence = [] |
620f9b141567
Load ../../google_appengine into trunk/thirdparty/google_appengine.
Todd Larsen <tlarsen@google.com>
parents:
diff
changeset
|
38 |
for word in words: |
620f9b141567
Load ../../google_appengine into trunk/thirdparty/google_appengine.
Todd Larsen <tlarsen@google.com>
parents:
diff
changeset
|
39 |
if word.lower() not in stopwords: |
620f9b141567
Load ../../google_appengine into trunk/thirdparty/google_appengine.
Todd Larsen <tlarsen@google.com>
parents:
diff
changeset
|
40 |
sentence.append(word) |
620f9b141567
Load ../../google_appengine into trunk/thirdparty/google_appengine.
Todd Larsen <tlarsen@google.com>
parents:
diff
changeset
|
41 |
return ' '.join(sentence) |
620f9b141567
Load ../../google_appengine into trunk/thirdparty/google_appengine.
Todd Larsen <tlarsen@google.com>
parents:
diff
changeset
|
42 |