Skip to content

Instantly share code, notes, and snippets.

@rgtjf
Last active July 12, 2017 02:24
Show Gist options
  • Save rgtjf/7ca1e4504191ca77c12e74098c854faa to your computer and use it in GitHub Desktop.
Save rgtjf/7ca1e4504191ca77c12e74098c854faa to your computer and use it in GitHub Desktop.
word frequency (tf, idf, stopwords)
def tf(sentence_list, min_cnt=1, max_cnt=None):
    """Count how often each word occurs across all sentences.

    Args:
        sentence_list: Iterable of sentences, where each sentence is an
            iterable of hashable words/tokens.
        min_cnt: Minimum total count a word needs in order to be kept.
        max_cnt: Maximum total count a word may have in order to be kept;
            ``None`` means there is no upper bound.

    Returns:
        A dict mapping every kept word to its total number of occurrences
        over all sentences.
    """
    # Imported locally: nothing in this file imports these at module level.
    from collections import Counter
    from itertools import chain

    # Flatten lazily instead of building one big concatenated word list.
    word_count = Counter(chain.from_iterable(sentence_list))

    return {
        word: count
        for word, count in word_count.items()
        if count >= min_cnt and (max_cnt is None or count <= max_cnt)
    }
def idf(sentence_list, min_cnt=1, max_cnt=None):
    """Compute the base-10 inverse document frequency of each word.

    Every sentence is treated as one document. For a word that occurs in
    ``df`` of the ``N`` sentences, the score is ``log10(N / df)``.

    Args:
        sentence_list: Iterable of sentences, where each sentence is an
            iterable of hashable words/tokens.
        min_cnt: Minimum number of sentences a word must occur in to be kept.
        max_cnt: Maximum number of sentences a word may occur in to be kept;
            ``None`` means there is no upper bound.

    Returns:
        A dict mapping every kept word to its IDF value. A word present in
        every sentence scores ``0.0``. Empty input yields an empty dict.
    """
    # Imported locally: nothing in this file imports these at module level.
    from collections import Counter
    import math

    doc_num = 0
    doc_freq = Counter()
    for sequence in sentence_list:
        doc_num += 1
        # Count each word once per sentence. Using raw term counts would let
        # the ratio drop below 1 and produce negative IDF values.
        doc_freq.update(set(sequence))

    # float() keeps the division exact under Python 2 as well.
    return {
        word: math.log10(float(doc_num) / freq)
        for word, freq in doc_freq.items()
        if freq >= min_cnt and (max_cnt is None or freq <= max_cnt)
    }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment