# Performance note: I benchmarked this code using a set instead of
# a list for the stopwords and was surprised to find that the list
# performed /better/ than the set - maybe because it's only a small
# list.

# Lowercase English function words, one per line; the literal is split on
# whitespace into a plain list. strip_stopwords() compares the lowercased
# form of each token against these entries.
stopwords = '''
i
a
an
are
as
at
be
by
for
from
how
in
is
it
of
on
or
that
the
this
to
was
what
when
where
'''.split()

def strip_stopwords(sentence, ignore=None):
    """Remove stopwords from *sentence* and normalize its whitespace.

    The sentence is split on runs of whitespace, every token whose
    lowercased form appears in the stopword collection is dropped, and the
    surviving tokens (original casing kept) are joined with single spaces.
    Tokens are compared whole, so attached punctuation (e.g. ``"the,"``)
    prevents a match.

    Args:
        sentence: Text to filter.
        ignore: Optional collection of lowercase words to drop. When
            omitted, the module-level ``stopwords`` list is used.

    Returns:
        The filtered sentence, or an empty string if no token survives.
    """
    if ignore is None:
        # Looked up at call time so later changes to the module list apply.
        ignore = stopwords
    kept = [word for word in sentence.split() if word.lower() not in ignore]
    # The u prefix keeps the result a text (unicode) string on Python 2.
    return u' '.join(kept)