# davidmezzetti/tokenizer.py

## tokenizer.py
import re
import sys

from nltk.stem.porter import PorterStemmer

class Tokenizer(object):
  """
  Whitespace tokenizer with optional lowercasing, stop word removal, Porter stemming
  and alphabetic-only filtering.
  """

  # Standard stop words used by Lucene/Elasticsearch
  STOP_WORDS = {
    "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it",
    "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these",
    "they", "this", "to", "was", "will", "with"
  }

  @staticmethod
  def tokenize(text, toLower=True, removeStopWords=True, stem=True, requireAlpha=True):
    """
    Splits text on whitespace and runs the enabled processing steps in a fixed order:
    lowercase, stop word removal, stemming, alphabetic filter.

    Args:
      text: input string
      toLower: if True, lowercases text before it is split
      removeStopWords: if True, drops tokens whose lowercase form is in STOP_WORDS
      stem: if True, replaces each token with its PorterStemmer stem
      requireAlpha: if True, keeps only tokens whose lowercase form is made up entirely
                    of the letters a-z

    Returns:
      list of tokens
    """

    # Case normalization (when requested) happens once, before splitting on whitespace
    words = (text.lower() if toLower else text).split()

    if removeStopWords:
      # Lookup uses the lowercase form, so matching ignores case even when toLower is off
      words = [word for word in words if word.lower() not in Tokenizer.STOP_WORDS]

    if stem:
      porter = PorterStemmer()
      words = list(map(porter.stem, words))

    if requireAlpha:
      # Runs after stemming, so the stemmed form is what gets tested. Tokens that still
      # carry digits or punctuation (e.g. "fox,") are discarded.
      words = [word for word in words if re.match(r"^[a-z]+$", word.lower())]

    return words

if __name__ == "__main__":
  # Command line entry point: tokenize the first argument with the default settings
  # and print the resulting list. Exit with a usage message when no argument is
  # given instead of failing with an IndexError on sys.argv[1].
  if len(sys.argv) < 2:
    sys.exit("Usage: tokenizer.py <text>")

  print(Tokenizer.tokenize(sys.argv[1]))
	import re
	import sys

	from nltk.stem.porter import PorterStemmer

	class Tokenizer(object):
		"""
		Whitespace tokenizer with optional lowercasing, stop word removal, Porter stemming
		and alphabetic-only filtering.
		"""

		# Standard stop words used by Lucene/Elasticsearch
		STOP_WORDS = set(["a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it",
			"no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these",
			"they", "this", "to", "was", "will", "with"])

		@staticmethod
		def tokenize(text, toLower=True, removeStopWords=True, stem=True, requireAlpha=True):
			"""
			Splits text on whitespace and runs the enabled processing steps in a fixed order:
			lowercase, stop word removal, stemming, alphabetic filter.

			Args:
				text: input string
				toLower: if True, lowercases text before it is split
				removeStopWords: if True, drops tokens whose lowercase form is in STOP_WORDS
				stem: if True, replaces each token with its PorterStemmer stem
				requireAlpha: if True, keeps only tokens whose lowercase form is made up
					entirely of the letters a-z

			Returns:
				list of tokens
			"""

			if toLower:
				text = text.lower()

			tokens = text.split()

			if removeStopWords:
				# Lookup uses the lowercase form, so matching ignores case even when toLower is off
				tokens = [x for x in tokens if x.lower() not in Tokenizer.STOP_WORDS]

			if stem:
				stemmer = PorterStemmer()
				tokens = [stemmer.stem(x) for x in tokens]

			if requireAlpha:
				# Runs after stemming; tokens that still carry digits or punctuation are discarded
				tokens = [x for x in tokens if re.match(r"^[a-z]+$", x.lower())]

			return tokens

	if __name__ == "__main__":
		# Command line entry point: tokenize the first argument with the default settings
		# and print the resulting list. Exit with a usage message when no argument is
		# given instead of failing with an IndexError on sys.argv[1].
		if len(sys.argv) < 2:
			sys.exit("Usage: tokenizer.py <text>")

		print(Tokenizer.tokenize(sys.argv[1]))