Skip to content

Instantly share code, notes, and snippets.

@kenttw
Last active October 31, 2015 08:45
Show Gist options
  • Save kenttw/c68b8f19b5a5adb0258c to your computer and use it in GitHub Desktop.
Save kenttw/c68b8f19b5a5adb0258c to your computer and use it in GitHub Desktop.
def cuttext(text):
    """Segment ``text`` with jieba and count the tokens that survive filtering.

    A token is skipped when it is found in the stop-word collection read
    from ``stop_sc.value``, or when it is exactly one character long and is
    not ``'$'``.

    Args:
        text: The text to segment; it is passed unchanged to ``jieba.cut``.

    Returns:
        A plain ``dict`` mapping each kept token to its occurrence count.
        The dict is empty when no token passes the filters.
    """
    # ``stop_sc`` is defined outside this function; only its ``.value`` is read.
    stop_words = stop_sc.value

    # Function-scope import, as in the original code.
    import jieba

    # NOTE(review): the user dictionary is loaded on every call and the path
    # is relative to the current working directory -- confirm both are
    # intended before moving this to a one-time initialisation.
    jieba.load_userdict("../data/new.dict_all")

    bag_word = {}
    for word in jieba.cut(text, cut_all=False):
        if word in stop_words:
            continue
        # Drop single-character tokens, but keep the '$' sign.
        if len(word) == 1 and word != '$':
            continue
        bag_word[word] = bag_word.get(word, 0) + 1
    return bag_word
def analysis_content(content):
print '*',
from bs4 import BeautifulSoup
soup = BeautifulSoup(content)
# text_len = len(soup.getText().replace('\n','').replace('\r','').replace(' ','').replace('\t',''))
cuts = cuttext(soup.getText())
return cuts
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment