Skip to content

Instantly share code, notes, and snippets.

@zxygentoo
Last active April 8, 2022 12:20
Show Gist options
  • Save zxygentoo/aab3c7aea570df68ddc874e19890f777 to your computer and use it in GitHub Desktop.
Save zxygentoo/aab3c7aea570df68ddc874e19890f777 to your computer and use it in GitHub Desktop.
使用 jieba 分词库的简单中文词频统计
#! python3
# -*- coding: utf-8 -*-
from collections import Counter
import sys
import os
import codecs
import getopt
import jieba
def help_then_exit():
    """Show the command-line usage string on stdout, then terminate.

    Termination goes through ``sys.exit()`` called without an argument, so
    ``SystemExit`` is raised carrying a ``None`` code.
    """
    usage = 'freq.py -i <inputfile>'
    print(usage)
    sys.exit()
def get_filename(argv):
    """Extract the input file name from the command-line arguments.

    Args:
        argv: Argument list without the program name (e.g. ``sys.argv[1:]``).

    Returns:
        The value given to ``-i`` / ``--input_file`` when that is the first
        parsed option.

    In every other case (unparsable arguments, no options at all, or a
    first option that is not the input-file flag) ``help_then_exit()`` is
    called.
    """
    try:
        parsed_opts, _ = getopt.getopt(argv, "hi:", ["input_file="])
    except getopt.GetoptError:
        help_then_exit()
        return None

    input_flags = ("-i", "--input_file")
    for flag, value in parsed_opts:
        if flag in input_flags:
            return value
        # Any other recognised option (i.e. -h) asks for the usage text.
        help_then_exit()

    # Reached when no option was supplied at all.
    help_then_exit()
def get_text(filename):
    """Read a UTF-8 encoded file and return its entire content as a string.

    Args:
        filename: Path of the file to read.

    Returns:
        The decoded text of the whole file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    # Built-in open() replaces the legacy codecs.open(). newline='' turns off
    # newline translation so line endings come back exactly as stored, which
    # keeps len(text) the same as with the previous binary-mode reader.
    with open(filename, 'r', encoding='utf-8', newline='') as source:
        return source.read()
def segment_words(text):
    """Split a text string into words using jieba.

    Args:
        text: The string to segment.

    Returns:
        Whatever ``jieba.cut`` returns for ``text`` -- an iterable of word
        strings (presumably a lazy generator rather than a list; verify
        against the installed jieba version).
    """
    words = jieba.cut(text)
    return words
def calculate_threshold(text_length, min_threshold=5, short_text_limit=100000,
                        chars_per_occurrence=10000):
    """Derive the minimum useful word occurrence count from the text length.

    Args:
        text_length: Length of the analysed text, in characters.
        min_threshold: Threshold returned for texts shorter than
            ``short_text_limit``.
        short_text_limit: Length below which the fixed ``min_threshold``
            applies.
        chars_per_occurrence: For longer texts, one required occurrence per
            this many characters.

    Returns:
        ``min_threshold`` for short texts, otherwise
        ``text_length // chars_per_occurrence`` as an int.
    """
    if text_length < short_text_limit:
        return min_threshold
    # Floor division keeps the arithmetic exact for integers; int() keeps the
    # result an int even if a float length is passed in.
    return int(text_length // chars_per_occurrence)
def count_words(word_list):
    """Count how often each multi-character word occurs.

    Words of length 0 or 1 are ignored entirely: they are neither counted
    nor stored as zero-count keys in the result.

    Args:
        word_list: Iterable of word strings (a list or a generator).

    Returns:
        A ``Counter`` mapping each word longer than one character to its
        number of occurrences.
    """
    # Filtering before counting keeps single-character tokens out of the
    # Counter, instead of assigning them an explicit count of 0.
    return Counter(word for word in word_list if len(word) > 1)
def filter_word_dict_on_threshold(word_dict, threshold):
    """Keep the words that occur at least ``threshold`` times.

    Args:
        word_dict: Object providing ``most_common()`` (e.g. a ``Counter``).
        threshold: Minimum occurrence count (inclusive) for a word to be
            kept.

    Returns:
        A list of ``(word, count)`` tuples in the order produced by
        ``word_dict.most_common()``.
    """
    kept = []
    for word, count in word_dict.most_common():
        if count >= threshold:
            kept.append((word, count))
    return kept
def print_result(word_dict):
    """Print one line per entry as rank, word and occurrence count.

    Args:
        word_dict: Iterable of ``(word, count)`` pairs, already in the
            order they should be ranked. Ranks start at 1.
    """
    row_format = '%d\t\t%s\t\t%d'
    for rank, (word, count) in enumerate(word_dict, 1):
        print(row_format % (rank, word, count))
def main(argv):
    """Run the word-frequency report for the file named on the command line.

    Args:
        argv: Command-line arguments without the program name.
    """
    input_path = get_filename(argv)
    text = get_text(input_path)

    # Same evaluation order as the nested-call form: count first, then
    # compute the threshold from the raw text length.
    word_counts = count_words(segment_words(text))
    threshold = calculate_threshold(len(text))

    frequent_words = filter_word_dict_on_threshold(word_counts, threshold)
    print_result(frequent_words)
if __name__ == "__main__":
    # Script entry point: hand everything after the program name to main().
    cli_args = sys.argv[1:]
    main(cli_args)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment