Skip to content

Instantly share code, notes, and snippets.

@James-McNeill
Created July 16, 2021 16:04
Show Gist options
  • Save James-McNeill/ae5efa794f913a0e527b5c77524ddb1b to your computer and use it in GitHub Desktop.
Save James-McNeill/ae5efa794f913a0e527b5c77524ddb1b to your computer and use it in GitHub Desktop.
# A few different options for stop words: spaCy and NLTK. Let's compare them.
import nltk
from nltk.corpus import stopwords
# Compare the English stop-word lists shipped with NLTK and spaCy: print each
# list with its size, then the words they share and the words unique to each.

# Load each collection exactly once and reuse it below; the original called
# stopwords.words('english') three times, repeating the same corpus lookup.
nltk_words = stopwords.words('english')
# NOTE(review): `nlp` is not defined in this snippet - presumably a spaCy
# pipeline object created earlier in the session (e.g. via spacy.load(...)).
# Confirm it exists before running this section.
spacy_words = nlp.Defaults.stop_words

# Comparison of the stop words available
print(f"NLTK : {len(nltk_words)} \n {nltk_words}")
print(f"Spacy : {len(spacy_words)} \n {spacy_words}")

# Work on sets so the two collections can be compared with set algebra
nltk_set = set(nltk_words)
spacy_set = set(spacy_words)

# Union - every stop word that appears in either library
union = nltk_set.union(spacy_set)

# Intersection - stop words seen in both sets
inter = nltk_set.intersection(spacy_set)
print(f"Seen in both : {len(inter)} \n {inter}")

# Remainder - words that only one of the two libraries treats as a stop word
nltk_extra = nltk_set - inter
spacy_extra = spacy_set - inter
print(f"Extra NLTK : {len(nltk_extra)} \n {nltk_extra}")
print(f"Extra Spacy : {len(spacy_extra)} \n {spacy_extra}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment