# Chestermozhao/_segment.py

## _segment.py
import re
from pyhanlp import *

# Alternation of substrings: _segment() leaves out any candidate word in
# which this pattern finds a match anywhere (search, not full match).
skip_word_pat = re.compile(
    "文章|https|http|html|JPTT|bbs|標題|ptt|女孩|八卦|版|WomenTalk|批踢踢"
)

# Module-wide tally filled in by _segment(): word -> times counted so far.
words_count_dict = {}


def _segment(source):
    """Count selected terms of ``source`` in ``words_count_dict``.

    ``source`` is passed to ``HanLP.segment``.  A returned term is counted
    when the string form of its ``nature`` starts with ``"n"`` (presumably
    HanLP's noun-family tags) and ``skip_word_pat`` finds no match in its
    text.  The module-level ``words_count_dict`` is updated in place and
    nothing is returned.
    """
    for token in HanLP.segment(source):
        # Guard 1: only natures beginning with "n" are of interest.
        if not str(token.nature).startswith("n"):
            continue
        text = token.word
        # Guard 2: drop words containing any of the skip substrings.
        if skip_word_pat.search(text) is not None:
            continue
        words_count_dict[text] = words_count_dict.get(text, 0) + 1
	import re
	from pyhanlp import *

	# Module-wide tally filled in by _segment(): word -> times counted so far.
	words_count_dict = {}
	# Alternation of substrings that disqualify a word from being counted.
	# The pipes must stay unescaped: written as "\|" they become literal
	# pipe characters, the whole pattern turns into one long literal string,
	# and no real word is ever filtered out.
	skip_word_pat = re.compile("文章|https|http|html|JPTT|bbs|標題|ptt|女孩|八卦|版|WomenTalk|批踢踢")


	def _segment(source):
	    """Count selected terms of ``source`` in ``words_count_dict``.

	    ``source`` is passed to ``HanLP.segment``.  A returned term is
	    counted when the string form of its ``nature`` starts with ``"n"``
	    (presumably HanLP's noun-family tags) and ``skip_word_pat`` finds no
	    match in its text.  ``words_count_dict`` is updated in place and
	    nothing is returned.
	    """
	    for term in HanLP.segment(source):
	        pos = str(term.nature)
	        word = term.word
	        # Only natures beginning with "n" are counted.
	        if not pos.startswith("n"):
	            continue
	        # Words containing any skip substring are ignored.
	        if skip_word_pat.search(word):
	            continue
	        words_count_dict[word] = words_count_dict.get(word, 0) + 1