author | Lennard de Rijk <ljvderijk@gmail.com> |
Sun, 25 Jan 2009 12:28:26 +0000 | |
changeset 980 | 3f3f7ec251bf |
parent 54 | 03e267d67478 |
permissions | -rw-r--r-- |
# Performance note: this code was benchmarked using a set instead of a
# list for the stopwords, and the list surprisingly performed /better/
# than the set - maybe because it's only a small list. Keep it a list
# unless a new benchmark says otherwise.
stopwords = '''
i a an are as at be by for from how in is it of on or that the this to was
what when where
'''.split()


def strip_stopwords(sentence):
    """Remove stopwords from ``sentence`` and normalize its whitespace.

    The sentence is split on runs of whitespace, every word whose
    lowercased form appears in the module-level ``stopwords`` list is
    dropped, and the remaining words are joined with single spaces.

    Matching is case-insensitive but exact: a word with punctuation
    attached (for example ``"the,"``) is not treated as a stopword.
    Surviving words keep their original casing.

    Args:
        sentence: The text to filter.

    Returns:
        The filtered text, with words separated by exactly one space and
        no leading or trailing whitespace. Empty when nothing survives.
    """
    kept = [
        word for word in sentence.split()
        if word.lower() not in stopwords
    ]
    return u' '.join(kept)