maxandron/stopword_speed.py

## stopword_speed.py
import re
from collections import Counter

from nltk.corpus import stopwords

# Lookup structure for test1: the English stopword collection exactly as
# NLTK hands it back (presumably a plain list -- confirm against NLTK docs).
stops = stopwords.words("english")

# Lookup structure for test2: one compiled regex that matches any stopword
# as a whole word, together with the whitespace that follows it.
pattern = re.compile("".join((r"\b(", "|".join(stops), r")\b\s*")))

# Lookup structure for test3: a Counter keyed by stopword (only key
# membership is used by the benchmark, not the counts).
stopwords_counter = Counter(stops)

# Lookup structure for test4 and test5: a dict keyed by stopword; the
# value 1 is a placeholder.
stopwords_dict = dict.fromkeys(stops, 1)


def test1(text):
    """Return ``text`` without stopwords, checking membership in ``stops``.

    The text is split on whitespace, every token found in the module-level
    ``stops`` collection is dropped, and the remaining tokens are joined
    back together with single spaces.
    """
    kept = [token for token in text.split() if token not in stops]
    return " ".join(kept)


def test2(text):
    """Return ``text`` with every match of the compiled ``pattern`` removed.

    This variant works on the raw string instead of on whitespace-separated
    tokens, so its output is not guaranteed to equal that of the split/join
    variants (e.g. a stopword directly followed by punctuation is removed
    here but kept there).
    """
    stripped = pattern.sub("", text)
    return stripped


def test3(text):
    """Return ``text`` without stopwords, checking membership in ``stopwords_counter``.

    Same split / filter / join procedure as the other token-based variants;
    only the container used for the membership test (a ``Counter``) differs.
    """
    kept = [token for token in text.split() if token not in stopwords_counter]
    return " ".join(kept)


def test4(text):
    """Return ``text`` without stopwords, checking membership in ``stopwords_dict``.

    Same split / filter / join procedure as the other token-based variants;
    only the container used for the membership test (a ``dict``) differs.
    """
    kept = [token for token in text.split() if token not in stopwords_dict]
    return " ".join(kept)


def test5(text):
    """Return ``text`` without stopwords, built by explicit string accumulation.

    Splits ``text`` on whitespace, skips every token present in
    ``stopwords_dict`` and appends the rest to a result string, separated by
    single spaces so the output matches the split/join variants.

    The ``+=`` accumulation is kept on purpose: it appears to be the approach
    this variant exists to time against ``" ".join(...)``.
    """
    new = ""
    for word in text.split():
        if word not in stopwords_dict:
            # Put one space between kept words, but never a leading one.
            new += " " + word if new else word
    return new


# Fixed sample sentence that every variant is run against.
body = "Maybe he believes that bitcoin will come down in price in the future?"

# Call each variant, in order, one million times so a profiler can compare
# their cumulative cost.
variants = (test1, test2, test3, test4, test5)
for _ in range(10 ** 6):
    for run in variants:
        run(body)

## test.sh
# Profile the benchmark script, sort the report by cumulative time and keep
# only the report lines that mention "test" (the testN functions).
python -m cProfile -s cumulative stopword_speed.py | grep test
	import re
	from collections import Counter

	from nltk.corpus import stopwords

	# test1 - Common example: the stopword collection as returned by NLTK
	stops = stopwords.words("english")

	# test2 - Using regex: any stopword as a whole word plus trailing whitespace.
	# A plain "|" is regex alternation; an escaped "\|" would match a literal
	# pipe character instead.
	pattern = re.compile(r"\b(" + r"|".join(stops) + r")\b\s*")

	# test3 - Using counter (only key membership is used)
	stopwords_counter = Counter(stops)

	# test4 - Using dict (the value 1 is a placeholder)
	stopwords_dict = {word: 1 for word in stops}


	def test1(text):
	    """Return ``text`` without stopwords, checking membership in ``stops``.

	    Splits on whitespace, drops tokens found in ``stops`` and joins the
	    rest with single spaces.
	    """
	    return " ".join([word for word in text.split() if word not in stops])


	def test2(text):
	    """Return ``text`` with every match of the compiled ``pattern`` removed.

	    Works on the raw string rather than on whitespace-separated tokens.
	    """
	    return pattern.sub("", text)


	def test3(text):
	    """Return ``text`` without stopwords, checking membership in ``stopwords_counter``.

	    Splits on whitespace, drops tokens found in the ``Counter`` and joins
	    the rest with single spaces.
	    """
	    return " ".join([word for word in text.split() if word not in stopwords_counter])


	def test4(text):
	    """Return ``text`` without stopwords, checking membership in ``stopwords_dict``.

	    Splits on whitespace, drops tokens found in the dict and joins the
	    rest with single spaces.
	    """
	    return " ".join([word for word in text.split() if word not in stopwords_dict])


	def test5(text):
	    """Return ``text`` without stopwords, built by explicit string accumulation.

	    Splits on whitespace, skips tokens present in ``stopwords_dict`` and
	    appends the rest to a result string, separated by single spaces so the
	    output matches the split/join variants.
	    """
	    new = ""
	    for word in text.split():
	        if word not in stopwords_dict:
	            # Put one space between kept words, but never a leading one.
	            new += " " + word if new else word
	    return new


	# Fixed sample sentence that every variant is run against.
	body = "Maybe he believes that bitcoin will come down in price in the future?"
	# Call each variant one million times so a profiler can compare them.
	for i in range(1000000):
	    test1(body)
	    test2(body)
	    test3(body)
	    test4(body)
	    test5(body)