Skip to content

Instantly share code, notes, and snippets.

@morkapronczay
Created October 15, 2019 12:10
Show Gist options
  • Save morkapronczay/93ae57d473a5093b4e35efaa8eae4780 to your computer and use it in GitHub Desktop.
Save morkapronczay/93ae57d473a5093b4e35efaa8eae4780 to your computer and use it in GitHub Desktop.
import copy
import math

from gensim import corpora
# Build the stemmed, stopword-free corpus, keyed by language and then by
# document (wiki page): {lan: {doc_key: [stemmed_token, ...]}}.
texts_bylang_byhuman = {}
for lan in languages:
    # Resolve the per-language stemmer and stopwords once instead of once per
    # token. The frozenset makes each membership test O(1) even when the
    # stopwords are stored as a list.
    # NOTE(review): assumes tokens and stopwords are hashable (e.g. str).
    _stem = stemmers[lan].stem
    _stopwords = frozenset(stopwords_bylang[lan])
    texts_bylang_byhuman[lan] = {
        key: [_stem(word) for word in val if word not in _stopwords]
        for key, val in texts_split[lan].items()
    }
# Build one gensim Dictionary per language from that language's token lists
# (the values of texts_bylang_byhuman[lan]).
dictionary_bylang_byhuman = {}
for lan in languages:
    dictionary_bylang_byhuman[lan] = corpora.Dictionary(
        texts_bylang_byhuman[lan].values()
    )
# Sparsity thresholds, given as percentages.
SPARSE_TRESH = [0.5, 1, 5, 10]
# Map each percentage to an absolute count: that percentage of the number of
# entries in texts['en'], rounded up to a whole number.
# NOTE(review): the count is taken from the English corpus only - confirm the
# other languages hold the same number of documents if these thresholds are
# reused for them.
_n_entries_en = len(texts['en'].keys())
sparse_perc = {
    pct: math.ceil(pct * _n_entries_en / 100)
    for pct in SPARSE_TRESH
}
# Remove sparse tokens: for every language and every threshold, store a
# filtered copy of that language's dictionary under
# filtered_dicts_sparse[lan][threshold_percentage].
filtered_dicts_sparse = {}
for lan in languages:
    full_dictionary = dictionary_bylang_byhuman[lan]
    vocab_size = len(full_dictionary)
    filtered_dicts_sparse[lan] = {}
    for perc, min_docs in sparse_perc.items():
        # filter_extremes modifies the dictionary in place, so filter a deep
        # copy and leave the unfiltered dictionary intact.
        pruned = copy.deepcopy(full_dictionary)
        filtered_dicts_sparse[lan][perc] = pruned
        # NOTE(review): no_above=1 and keep_n=<full vocabulary size> are
        # presumably there so only the no_below rule drops tokens - confirm.
        pruned.filter_extremes(no_below=min_docs, no_above=1, keep_n=vocab_size)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment