equal
deleted
inserted
replaced
|
# Performance note: a set was benchmarked against a list for the stopword
# lookup and, surprisingly, the list came out /faster/ - most likely because
# the collection is so small. Keep this as a list unless a new benchmark
# says otherwise.

# Common English function words, kept in lower case; built by splitting a
# whitespace-separated string so the result is a plain list of str.
stopwords = (
    'i a an are as at be by for from how in is it of on or '
    'that the this to was what when where'
).split()
|
33 |
|
def strip_stopwords(sentence):
    """Return *sentence* with stopwords removed and whitespace normalized.

    The sentence is split on runs of whitespace; every token whose
    lower-cased form appears in the module-level ``stopwords`` list is
    dropped. The remaining tokens keep their original casing and are
    joined back together with single spaces.
    """
    kept = [
        token for token in sentence.split()
        if token.lower() not in stopwords
    ]
    return u' '.join(kept)
|
42 |