Skip to content

Instantly share code, notes, and snippets.

@reee
Created March 24, 2014 05:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save reee/9734697 to your computer and use it in GitHub Desktop.
Save reee/9734697 to your computer and use it in GitHub Desktop.
Split a document into words and compare them against a keyword list
#!/usr/bin/env python
# coding:utf-8
import io
import os

import jieba
# Define where we put the files.
# NOTE: despite its name, source_dictionary is the *directory* that holds the
# source documents; the name is kept so existing references keep working.
source_dictionary = "/opt/ht/source"
keywords_file = "/opt/ht/keywords-new.txt"
result_file = "/opt/ht/result.txt"

# IDEOGRAPHIC FULL STOP, the Chinese sentence terminator. Written as an escape
# so the literal does not depend on the encoding of this source file.
SENTENCE_DELIMITER = u"\u3002"


def load_keywords(path):
    """Return the set of keywords stored one per line in *path*.

    The file is decoded as UTF-8. Surrounding whitespace (including the
    trailing newline) is stripped and blank lines are skipped; without the
    stripping every keyword would carry a newline and could never equal a
    token produced by the segmenter.
    """
    with io.open(path, encoding="utf-8") as handle:
        stripped = (line.strip() for line in handle)
        return {keyword for keyword in stripped if keyword}


def split_sentences(text):
    """Split *text* into sentences on the ideographic full stop.

    Like ``str.split``, a trailing delimiter yields a final empty string.
    """
    return text.split(SENTENCE_DELIMITER)


def match_keywords(sentence, keywords):
    """Return the keywords that occur as jieba tokens in *sentence*.

    *keywords* must be a set of text (unicode) strings. The result is sorted
    so the output is deterministic instead of following set iteration order.
    """
    tokens = set(jieba.cut(sentence, cut_all=False))
    return sorted(tokens & keywords)


def main():
    """Write the keywords found in each sentence of each source file.

    For every regular file in ``source_dictionary`` one line is written to
    ``result_file`` per sentence, holding the matched keywords each followed
    by ``;``. The marker ``A New File Start`` is written after each file.
    All files are read and written as UTF-8.
    """
    keywords = load_keywords(keywords_file)
    with io.open(result_file, "w", encoding="utf-8") as r_file:
        for name in sorted(os.listdir(source_dictionary)):
            # os.listdir returns bare names; join them with the directory so
            # the files are found regardless of the current working directory.
            path = os.path.join(source_dictionary, name)
            if not os.path.isfile(path):
                continue  # skip sub-directories and other non-regular entries
            with io.open(path, encoding="utf-8") as source:
                article = source.read()
            for sentence in split_sentences(article):
                for word in match_keywords(sentence, keywords):
                    r_file.write(u"%s;" % word)
                r_file.write(u"\n")
            # Kept exactly as in the original output format (no newline).
            r_file.write(u"A New File Start")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment