Last active
February 13, 2022 16:57
-
-
Save maxandron/3c276924242e7d29d9cf980da0a8a682 to your computer and use it in GitHub Desktop.
Stop words removal speed test
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
from collections import Counter | |
from nltk.corpus import stopwords | |
# Shared fixtures: the same English stop-word list held in four different
# containers, one per benchmarked strategy.

# test1 - Common example: the plain list as returned by NLTK
stops = stopwords.words("english")

# test2 - Using regex: one alternation of every stop word, matched as a
# whole word together with any whitespace that follows it
pattern = re.compile("".join([r"\b(", r"|".join(stops), r")\b\s*"]))

# test3 - Using counter: stop words become the keys of a Counter
stopwords_counter = Counter()
stopwords_counter.update(stops)

# test4 - Using dict: stop words become the keys of a plain dict (value 1)
stopwords_dict = dict.fromkeys(stops, 1)
def test1(text):
    """Remove stop words using membership tests against the ``stops`` list.

    ``text`` is split on whitespace; every token that is not an element of
    the module-level ``stops`` list is kept, and the kept tokens are
    returned joined by single spaces.
    """
    tokens = text.split()
    kept = [token for token in tokens if token not in stops]
    return " ".join(kept)
def test2(text):
    """Remove stop words with a single regex substitution.

    Every match of the module-level compiled ``pattern`` in ``text`` is
    replaced with the empty string and the resulting string is returned.
    """
    stripped = pattern.sub("", text)
    return stripped
def test3(text):
    """Remove stop words using key lookups in ``stopwords_counter``.

    ``text`` is split on whitespace; tokens that are not keys of the
    module-level ``stopwords_counter`` are kept and returned joined by
    single spaces.
    """
    tokens = text.split()
    kept = [token for token in tokens if token not in stopwords_counter]
    return " ".join(kept)
def test4(text):
    """Remove stop words using key lookups in ``stopwords_dict``.

    ``text`` is split on whitespace; tokens that are not keys of the
    module-level ``stopwords_dict`` are kept and returned joined by single
    spaces.
    """
    tokens = text.split()
    kept = [token for token in tokens if token not in stopwords_dict]
    return " ".join(kept)
def test5(text):
    """Remove stop words with an explicit loop over a dict lookup.

    ``text`` is split on whitespace; tokens that are not keys of the
    module-level ``stopwords_dict`` are kept and returned joined by single
    spaces, so the result has the same shape as the list-comprehension
    variants and the timings compare like with like.

    The previous version appended each kept word directly onto a string
    with no separator, which glued the words together
    ("Maybebelievesbitcoin...") and did less work than the other variants.
    """
    kept = []
    # Deliberately a plain for-loop rather than a comprehension: this variant
    # exists to time the loop form of the dict-lookup approach.
    for word in text.split():
        if word not in stopwords_dict:
            kept.append(word)
    return " ".join(kept)
# Sample sentence passed to every stop-word removal variant.
body = "Maybe he believes that bitcoin will come down in price in the future?"

# Benchmark driver: call each variant one million times so a profiler can
# attribute cumulative time per function. Guarded so that merely importing
# this module does not start the run; executing the file as a script
# (including under `python -m cProfile`) still does.
if __name__ == "__main__":
    for _ in range(1000000):
        test1(body)
        test2(body)
        test3(body)
        test4(body)
        test5(body)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
python -m cProfile -s cumulative speed.py | grep test
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
1000000 0.816 0.000 21.049 0.000 speed2.py:19(test1)
1000000 0.344 0.000 11.616 0.000 speed2.py:23(test2)
1000000 0.812 0.000 3.059 0.000 speed2.py:27(test3)
1000000 0.775 0.000 2.749 0.000 speed2.py:31(test4)
1000000 1.681 0.000 2.246 0.000 speed2.py:35(test5)
Using just a dict, without a list comprehension, was the fastest solution.