app/django/utils/stopwords.py
changeset 54 03e267d67478
equal deleted inserted replaced
53:57b4279d8c4e 54:03e267d67478
       
     1 # Performance note: I benchmarked this code using a set instead of
       
     2 # a list for the stopwords and was surprised to find that the list
       
     3 # performed /better/ than the set - maybe because it's only a small
       
     4 # list.
       
     5 
       
     6 stopwords = '''
       
     7 i
       
     8 a
       
     9 an
       
    10 are
       
    11 as
       
    12 at
       
    13 be
       
    14 by
       
    15 for
       
    16 from
       
    17 how
       
    18 in
       
    19 is
       
    20 it
       
    21 of
       
    22 on
       
    23 or
       
    24 that
       
    25 the
       
    26 this
       
    27 to
       
    28 was
       
    29 what
       
    30 when
       
    31 where
       
    32 '''.split()
       
    33 
       
    34 def strip_stopwords(sentence):
       
    35     "Removes stopwords - also normalizes whitespace"
       
    36     words = sentence.split()
       
    37     sentence = []
       
    38     for word in words:
       
    39         if word.lower() not in stopwords:
       
    40             sentence.append(word)
       
    41     return u' '.join(sentence)
       
    42