@davidmezzetti
Created January 29, 2020 22:36
import re
import sys

from nltk.stem.porter import PorterStemmer


class Tokenizer(object):
    # Standard stop words used by Lucene/Elasticsearch
    STOP_WORDS = set(["a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it",
                      "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these",
                      "they", "this", "to", "was", "will", "with"])

    @staticmethod
    def tokenize(text, toLower=True, removeStopWords=True, stem=True, requireAlpha=True):
        # Normalize case before splitting
        if toLower:
            text = text.lower()

        # Split on whitespace
        tokens = text.split()

        # Filter out common stop words
        if removeStopWords:
            tokens = [x for x in tokens if x.lower() not in Tokenizer.STOP_WORDS]

        # Reduce each token to its Porter stem
        if stem:
            stemmer = PorterStemmer()
            tokens = [stemmer.stem(x) for x in tokens]

        # Keep only purely alphabetic tokens
        if requireAlpha:
            tokens = [x for x in tokens if re.match(r"^[a-z]+$", x.lower())]

        return tokens


if __name__ == "__main__":
    print(Tokenizer.tokenize(sys.argv[1]))
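
A quick usage sketch (the input sentence is a made-up example; the expected output assumes NLTK's default PorterStemmer behavior):

# Lowercases the text, drops the stop words "the" and "are", then stems
# "foxes" -> "fox" and "running" -> "run"
print(Tokenizer.tokenize("The quick brown foxes are running"))
# ['quick', 'brown', 'fox', 'run']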