Skip to content

Instantly share code, notes, and snippets.

@Chestermozhao
Last active July 30, 2020 08:36
Show Gist options
  • Save Chestermozhao/c6a74cc98a713910fe0f3d53c0d2de66 to your computer and use it in GitHub Desktop.
Save Chestermozhao/c6a74cc98a713910fe0f3d53c0d2de66 to your computer and use it in GitHub Desktop.
import re
from pyhanlp import *
# Running tally of word -> occurrence count; _segment() updates it in place.
words_count_dict = dict()

# A word matching any of these alternatives is left out of the tally by
# _segment(). The pattern is applied with search(), so a match anywhere
# inside the word is enough to skip it.
skip_word_pat = re.compile(
    "文章|https|http|html|JPTT|bbs|標題|ptt|女孩|八卦|版|WomenTalk|批踢踢"
)
def _segment(source):
    """Count the noun-tagged words of ``source`` into ``words_count_dict``.

    ``source`` is segmented with ``HanLP.segment``. A term is counted only
    when the string form of its ``nature`` starts with ``"n"`` and its word
    does not match ``skip_word_pat``.

    NOTE(review): the ``"n"`` prefix presumably selects HanLP's noun-like
    part-of-speech tags -- confirm against the tag set in use.

    Args:
        source: Text to segment; passed to ``HanLP.segment`` unchanged.

    Returns:
        None. The module-level ``words_count_dict`` is updated in place, so
        counts accumulate across calls.
    """
    for term in HanLP.segment(source):
        # Use str() instead of invoking the __str__ dunder directly.
        pos = str(term.nature)
        word = term.word
        if not pos.startswith("n"):
            continue
        if skip_word_pat.search(word):
            continue
        # dict.get supplies 0 for a word seen for the first time.
        words_count_dict[word] = words_count_dict.get(word, 0) + 1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment