# morkapronczay/gensim-dict-filter-extremes.py

## gensim-dict-filter-extremes.py
import copy
import math

from gensim import corpora

# Build the stemmed, stopword-free corpus: language -> document key -> tokens.
# (Per the original notes, each document is a wiki page.)
texts_bylang_byhuman = {
    lan: {
        key: [
            stemmers[lan].stem(word)
            for word in val
            if word not in stopwords_bylang[lan]
        ]
        for key, val in texts_split[lan].items()
    }
    for lan in languages
}

# One gensim Dictionary per language, built from that language's token lists.
dictionary_bylang_byhuman = {
    lan: corpora.Dictionary(texts_bylang_byhuman[lan].values())
    for lan in languages
}

# Sparsity thresholds, expressed as percentages of the document count.
SPARSE_TRESH = [0.5, 1, 5, 10]

# Map each percentage to the minimum number of documents a token must appear
# in (rounded up).
# NOTE(review): the document count is taken from the English texts only and
# the resulting thresholds are applied to every language -- this presumably
# assumes all languages cover the same number of documents; confirm.
sparse_perc = {
    k: math.ceil(k * len(texts['en']) / 100)
    for k in SPARSE_TRESH
}

# Remove sparse tokens: language -> percentage threshold -> filtered Dictionary.
filtered_dicts_sparse = {}
for lan in languages:
    full_dictionary = dictionary_bylang_byhuman[lan]
    vocab_size = len(full_dictionary)
    filtered_dicts_sparse[lan] = {}
    for k, v in sparse_perc.items():
        # filter_extremes() modifies the dictionary in place, so filter a
        # deep copy to keep the unfiltered original intact.
        filtered = copy.deepcopy(full_dictionary)
        # no_above=1 disables the upper document-frequency cut and
        # keep_n=vocab_size disables the size cap, so only tokens that occur
        # in fewer than `v` documents are dropped.
        filtered.filter_extremes(no_below=v, no_above=1, keep_n=vocab_size)
        filtered_dicts_sparse[lan][k] = filtered