Skip to content

Instantly share code, notes, and snippets.

@davidlenz
Last active June 5, 2018 10:26
Show Gist options
  • Save davidlenz/7c5fcbb6c505ec0d137ef57df899b63d to your computer and use it in GitHub Desktop.
Save davidlenz/7c5fcbb6c505ec0d137ef57df899b63d to your computer and use it in GitHub Desktop.
Function to generate a list of stopwords from different sources.
import stop_words
from langdetect import detect
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import ast
def get_all_stopwords(text_sample='This is an englisch sentence',
                      stopwords_file='stopwords.txt'):
    """Build one combined stopword list from NLTK, ``stop_words`` and a file.

    The language of ``text_sample`` is detected with ``langdetect``. The
    result is the union of:

    * the NLTK stopwords for German, English, French and Spanish,
    * the ``stop_words`` package's list for the detected language (skipped
      when that package has no list for it), and
    * the entries of the Python literal stored in ``stopwords_file``.

    Args:
        text_sample: Text whose language is detected.
        stopwords_file: Path of a text file (read as ``utf-8-sig``) that
            contains a Python literal, e.g. a list of strings, with
            additional stopwords. Defaults to ``'stopwords.txt'`` in the
            current working directory.

    Returns:
        A tuple ``(stopwordlist, lang)``: the de-duplicated stopwords as a
        list in no particular order, and the detected language code.

    Raises:
        OSError: If ``stopwords_file`` cannot be opened.
        ValueError, SyntaxError: If the file content is not a valid Python
            literal (raised by ``ast.literal_eval``).
    """
    # detect language
    lang = detect(text_sample)
    print('DETECTED LANGUAGE : {}'.format(lang))

    # NLTK stopwords for the common languages, merged into one set so that
    # duplicates across sources collapse automatically.
    combined = set()
    for nltk_language in ('german', 'english', 'french', 'spanish'):
        combined.update(stopwords.words(nltk_language))

    # Extra stopwords from the text file. literal_eval only accepts Python
    # literals, so the file content is parsed without executing any code.
    with open(stopwords_file, encoding='utf-8-sig') as f:
        combined.update(ast.literal_eval(f.read()))

    # Stopwords from the stop_words library for the detected language.
    # langdetect can report languages that stop_words ships no list for; in
    # that case keep the stopwords gathered so far instead of failing.
    try:
        combined.update(stop_words.get_stop_words(lang))
    except stop_words.StopWordError:
        print('NO stop_words LIST FOR LANGUAGE : {}'.format(lang))

    return list(combined), lang
def _find_language(text):
if text != '':
return detect(text[:5000])
['dass', 'sollen', 'sei','seien','rund', 'wurde', 'etwa', 'sowie', 'sagte', 'sollten', 'us', 'dollar', 'euro','konnte',
'gibt', 'zwei', 'mehr', 'neue', 'bereits', 'dabei', 'beim', 'lassen','laut', 'schon','seit', 'ab','allerdings','jahr','erste',
'immer','wurden','neuen', 'prozent','zufolge', 'viele', 'millionen', 'milliarden','the', 'beiden','wolle', 'bekannt',
'ersten', 'müssen','dafür', 'worden', 'dpa','wegen','hätten','lässt', 'berichtet','stellen','müsse','heute','geht','sagt','ja','wer',
'finden','per', 'innerhalb', 'zurück', 'bislang','produkte','mittels','gmbh','kunden','mitarbeiter', 'unternehmen',
'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j','k','l','m','n','o','p','q','r','s','t',
"usw", "hinsichtlich", "proz", "mill", "einzelnen", "deutschen", "fast", "Anderseits",
"wirtschaftlichen", "verf", "hrsg", "deutschland", "etc", "Zweifellos", "letzten", "bzw", "jahres",
"zeigt", "Vorjahres", "proz", "dr", "gr", "einzelnen", "heft", "bzw", "vgl", "läßt", "Zahl",
"Jahren", "R.", "A.", "S.", "Gr", "Ga", "D.", "n.", "K.", "ltd", "wesentlich", "co", "stark",
"z.B.", "b.H.", "III", "J.", "dz", "mk", "jahres", "L.", "No", "A", "Bd.", "jim", "viii", "liefert",
"verf", "K.", "Verfassers", "D.", "Dr.", "R.", "C.", "E.", "I", "II", "III", "IV", "V", "VI", "VII",
"VIII", "IX", "X", "XI",
'u','v','w', 'x', 'y', 'z', 'erklärt', 'stand', 'vergangenen', 'jan', 'erst', 'jedoch', 'zuletzt', 'hieß', 'ane']
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment