Last active
June 5, 2018 10:26
-
-
Save davidlenz/7c5fcbb6c505ec0d137ef57df899b63d to your computer and use it in GitHub Desktop.
Function to generate a list of stopwords from different sources.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import stop_words | |
from langdetect import detect | |
import nltk | |
nltk.download('stopwords') | |
from nltk.corpus import stopwords | |
import ast | |
def get_all_stopwords(text_sample='This is an englisch sentence',
                      stopwords_file='stopwords.txt'):
    """Build one large stopword list from several sources.

    The result is the union of:

    * the NLTK stopword lists for German, English, French and Spanish,
    * the ``stop_words`` package list for the language detected in
      ``text_sample`` (skipped when that package has no list for it),
    * the entries of ``stopwords_file``, which must contain a Python
      literal (e.g. a list of strings) readable by ``ast.literal_eval``.

    Args:
        text_sample: Text whose language is detected with ``langdetect``.
        stopwords_file: Path of the additional stopword file. It is opened
            with ``utf-8-sig`` so a leading BOM is ignored.

    Returns:
        A tuple ``(stopwordlist, lang)``: the de-duplicated stopwords as a
        list (built from a set, so the order is arbitrary) and the detected
        language code.

    Raises:
        OSError: If ``stopwords_file`` cannot be opened.
        ValueError, SyntaxError: If the file content is not a valid literal.

    NOTE(review): ``detect`` itself may raise for samples without any
    detectable features (e.g. an empty string) -- confirm against the
    langdetect documentation if such input is possible.
    """
    # detect language
    lang = detect(text_sample)
    print('DETECTED LANGUAGE : {}'.format(lang))

    # get nltk stopwords for common languages
    nltk_words = [
        word
        for language in ('german', 'english', 'french', 'spanish')
        for word in stopwords.words(language)
    ]

    # read the additional stopwords; literal_eval only accepts literals, so
    # the file content is parsed, never executed
    with open(stopwords_file, encoding='utf-8-sig') as f:
        file_words = ast.literal_eval(f.read())

    # langdetect can report languages that the stop_words package does not
    # ship; in that case keep going with the other two sources instead of
    # failing the whole call
    try:
        library_words = stop_words.get_stop_words(lang)
    except stop_words.StopWordError:
        library_words = []

    # join stop words from nltk, txt and from library stop_words
    combined = set(nltk_words) | set(library_words) | set(file_words)
    stopwordlist = list(combined)
    return stopwordlist, lang
def _find_language(text): | |
if text != '': | |
return detect(text[:5000]) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
['dass', 'sollen', 'sei','seien','rund', 'wurde', 'etwa', 'sowie', 'sagte', 'sollten', 'us', 'dollar', 'euro','konnte',
'gibt', 'zwei', 'mehr', 'neue', 'bereits', 'dabei', 'beim', 'lassen','laut', 'schon','seit', 'ab','allerdings','jahr','erste',
'immer','wurden','neuen', 'prozent','zufolge', 'viele', 'millionen', 'milliarden','the', 'beiden','wolle', 'bekannt',
'ersten', 'müssen','dafür', 'worden', 'dpa','wegen','hätten','lässt', 'berichtet','stellen','müsse','heute','geht','sagt','ja','wer',
'finden','per', 'innerhalb', 'zurück', 'bislang','produkte','mittels','gmbh','kunden','mitarbeiter', 'unternehmen',
'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j','k','l','m','n','o','p','q','r','s','t',
"usw", "hinsichtlich", "proz", "mill", "einzelnen", "deutschen", "fast", "Anderseits",
"wirtschaftlichen", "verf", "hrsg", "deutschland", "etc", "Zweifellos", "letzten", "bzw", "jahres",
"zeigt", "Vorjahres", "proz", "dr", "gr", "einzelnen", "heft", "bzw", "vgl", "läßt", "Zahl",
"Jahren", "R.", "A.", "S.", "Gr", "Ga", "D.", "n.", "K.", "ltd", "wesentlich", "co", "stark",
"z.B.", "b.H.", "III", "J.", "dz", "mk", "jahres", "L.", "No", "A", "Bd.", "jim", "viii", "liefert",
"verf", "K.", "Verfassers", "D.", "Dr.", "R.", "C.", "E.", "I", "II", "III", "IV", "V", "VI", "VII",
"VIII", "IX", "X", "XI",
'u','v','w', 'x', 'y', 'z', 'erklärt', 'stand', 'vergangenen', 'jan', 'erst', 'jedoch', 'zuletzt', 'hieß', 'ane']
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment