# -*- coding: utf-8 -*-
import time
from itertools import islice, izip
from math import sqrt
from HTMLParser import HTMLParser

from bson import ObjectId

import MongoDBConn

dbconn = MongoDBConn.DBConn()
dbconn.connect()
conn = dbconn.getConn()
fin_items = conn.scrapy.fin_items
# conn.scrapy.createCollection
fin_2_items = conn.scrapy.fin_2_items
text_group = conn.scrapy.text_group
# word_dict = conn.scrapy.word_dict
word_dict = {}
url_footprint = {}
tmp_items = {}
final_items = {}
sim_tmp = {}
threshold = 0.3
g_count = 0
# fin_items.remove()
text_feature_dict = {}
def jaccard_list(a, *arg):
    # Jaccard similarity of set a against each set in arg.
    f_list = []
    for b in arg:
        c = a.intersection(b)
        f_list.append(float(len(c)) / (len(a) + len(b) - len(c)))
    return f_list

def jaccard(a, b):
    # Jaccard similarity of two sets: intersection size over union size.
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))
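
# A minimal check of the Jaccard helper above (hypothetical values, not
# called anywhere in the script): two 4-element sets sharing 3 elements
# give intersection / union = 3 / 5 = 0.6.
def _example_jaccard():
    a = set(["a", "b", "c", "d"])
    b = set(["b", "c", "d", "e"])
    return jaccard(a, b)  # -> 0.6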
def _line2characters(line):
    # Split a string into a list of single utf-8 byte characters.
    return list(line.encode("utf-8"))

def getwords(words):
    '''
    Keep only the long paragraphs (more than 100 characters) of a text,
    concatenated into one string.
    '''
    fin_string = ""
    for para in sorted(words.split("\n"), key=len):
        if len(para) > 100:
            fin_string += para
    return fin_string
def ngrams(words, n):
    # Sliding n-gram tuples over a sequence; n=2 gives bigrams.
    z = (islice(words, i, None) for i in range(n))
    return zip(*z)
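
# Hedged usage sketch for ngrams() (not called anywhere): with n=2 the
# generator yields the sequence offset by 0 and by 1, and zip pairs them.
def _example_ngrams():
    return ngrams(list(u"abcd"), 2)  # -> [(u'a', u'b'), (u'b', u'c'), (u'c', u'd')]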
def cosine_distance(a):
    # Cosine distance between vector a and the all-ones vector of the same
    # length: 1 - sum(a) / sqrt(sum(a*a) * len(a)).
    b = [1] * len(a)
    ab_sum, a_sum, b_sum = 0, 0, 0
    for ai, bi in izip(a, b):
        ab_sum += ai * bi
        a_sum += ai * ai
        b_sum += bi * bi
    if a_sum == 0:
        # An all-zero vector has no direction; treat it as maximally distant.
        return 1.0
    return 1 - ab_sum / sqrt(a_sum * b_sum)
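
# Sketch of what cosine_distance() computes on a 0/1 match vector
# (illustrative values, not called anywhere): for a = [1, 1, 0, 0],
# sum(a) = 2, sum(a*a) = 2, len(a) = 4, so the result is
# 1 - 2 / sqrt(2 * 4) ~= 0.293.
def _example_cosine_distance():
    return cosine_distance([1, 1, 0, 0])  # -> ~0.293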
def cut_sentence(words):
    # Split utf-8 text on punctuation bytes and keep only long sentences.
    words = words.encode('utf8')
    start = 0
    i = 0
    sents = []
    punt_list = ',.!?:;~,。!?:;~ '
    token = ""
    for word in words:
        if word in punt_list and token not in punt_list:
            sents.append(words[start:i + 1])
            start = i + 1
            i += 1
        else:
            i += 1
        nxt = words[start:i + 2]
        token = nxt[-1] if nxt else ""  # peek at the next character
    if start < len(words):
        sents.append(words[start:])
    f_sents = []
    for sent in sents:
        # utf-8 CJK characters take 3 bytes each, so this keeps sentences
        # of roughly 8 or more Chinese characters.
        if len(sent) / 3 >= 8:
            f_sents.append(sent)
    return f_sents
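
# Hedged sketch of cut_sentence()'s filter (not called anywhere): it works
# on utf-8 bytes, so the len(sent) / 3 >= 8 test keeps fragments of 24+
# bytes. Splitting happens on any byte of the punctuation string, including
# the space, so ASCII text is cut at every word boundary as well.
def _example_cut_sentence():
    # A fragment of 24+ bytes containing no punctuation/space bytes survives.
    return cut_sentence(u"abcdefghijklmnopqrstuvwxyz")  # -> the whole string, in a list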
def cosine_words(word1, word2):
    '''
    Align two token lists to the same length by padding the shorter one
    with spaces, then return their element-wise 0/1 match vector.
    '''
    long_word = word1 if len(word1) >= len(word2) else word2
    short_word = word2 if len(word1) >= len(word2) else word1
    # print long_word
    # print short_word
    # Pad a copy so the caller's list is not mutated.
    short_word = short_word + [" "] * (len(long_word) - len(short_word))
    fin = []
    for i in xrange(len(long_word)):
        if long_word[i] == short_word[i]:
            fin.append(1)
        else:
            fin.append(0)
    return fin
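
# Hedged sketch combining cosine_words() and cosine_distance(), mirroring
# the commented test at the bottom of this file: two token lists are
# aligned into a 0/1 match vector, whose distance to the all-ones vector
# measures how much the lists disagree.
def _example_cosine_words():
    v = cosine_words([u"你好", u"好么", u"好啊"], [u"你好"])  # -> [1, 0, 0]
    return cosine_distance(v)  # -> 1 - 1 / sqrt(1 * 3) ~= 0.42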
def final_distance(list1, list2):
    # Weighted combination of title, bigram and long-sentence distances
    # (weights 1:2:4, normalised by 7). Titles are plain strings, so they
    # are turned into character lists for cosine_words().
    title1 = list(list1[0])
    title2 = list(list2[0])
    body1 = ngrams(list1[1], 2)
    body2 = ngrams(list2[1], 2)
    long_sents_1 = cut_sentence(list1[1])
    long_sents_2 = cut_sentence(list2[1])
    t_float = cosine_distance(cosine_words(title1, title2))
    b_float = cosine_distance(cosine_words(body1, body2))
    s_float = cosine_distance(cosine_words(long_sents_1, long_sents_2))
    return float((4 * s_float + 2 * b_float + 1 * t_float) / 7)
class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.fed = []

    def handle_data(self, d):
        self.fed.append(d)

    def get_data(self):
        return ''.join(self.fed)

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()
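
# Usage sketch for strip_tags() (not called anywhere): MLStripper collects
# only the text nodes, so the markup disappears.
def _example_strip_tags():
    return strip_tags(u"<p>hello <b>world</b></p>")  # -> u'hello world'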
def main():
    global threshold, fin_items, fin_2_items
    '''
    1. Tokenise into bigrams; return a list.
    2. Extract the long sentences; return a list.
    3. Combine both and compute a cosine value for every pair.

    Scan the collection:
        if an item is already in sim_set, skip it;
        otherwise add it to fin_set, and add every item whose distance
        is above the threshold to sim_set.
    Finally insert everything outside fin_sim_set.
    '''
    print "start"
    print fin_items.find().count()
    for item in fin_items.find():
        print "test"
        _id = item["_id"]
        title = item["title"]
        body = getwords(strip_tags(item["content"]))
        tmp_items[_id] = [title, body]
    sim_set = set()
    fin_set = set()
    for k_p, v_p in tmp_items.items():
        for k, v in tmp_items.items():
            if k == k_p:
                continue
            if k in sim_set:
                print k, " in sim_set"
            else:
                d = final_distance(v, v_p)
                if d > threshold:
                    sim_set.add(k)
                    fin_set.add(k_p)
    fin_sim_set = sim_set - fin_set
    fin_2_items.remove()
    for item in fin_items.find():
        if item["_id"] in fin_sim_set:
            print " item ", item['_id'], item['title'], " is in fin_sim_set"
        else:
            fin_2_items.insert(item)
# Following the same idea, build a centroid here.
def similiarity_comp(id, arg_list):
    '''
    input: item id and [title, ...]. Requires `import jieba` and
    `from simhash import Simhash`, plus an initialised global `index`.
    '''
    global sim_tmp, sim_group, index
    # TODO: the global index still needs to be initialised.
    title_simhash = Simhash(jieba.cut(arg_list[0]))
    if index.count == 0:
        index.add(id, title_simhash)
        sim_tmp[id] = set()
    else:
        near_list = index.get_near_dups(title_simhash)
        if near_list:
            # If near-duplicates are found, add this point to each of their sets.
            for key_id in near_list:
                sim_tmp[key_id].add(id)
        else:
            # Otherwise create a new entry in sim_tmp and update the index.
            sim_tmp[id] = set()
            index.add(id, title_simhash)
def cal_item_time(sim_set, **xargs):
    # Returns (old_item_time, new_item_time, last_modified_time) for a
    # group of similar items, based on each item's crawl time.
    old_item_time, new_item_time, last_modified_time = None, None, 0
    tmp_items = {}
    time_list = []
    for key_id in sim_set:
        tmp_item = fin_items.find({"_id": ObjectId(key_id)}, {"_spider.ctime": 1})[0]
        tmp_items[str(tmp_item["_id"])] = int(tmp_item["_spider"]["ctime"])
    for k, v in tmp_items.items():
        time_list.append(v)
    time_list = sorted(time_list)
    if xargs:
        print "xargs"
        # Keyword names are assumed to mirror the returned values.
        old_item_time = xargs.get("old_item_time")
        new_item_time = xargs.get("new_item_time")
        last_modified_time = xargs.get("last_modified_time", 0)
        if time_list:
            # Keep the earliest start time and the latest end time.
            if time_list[0] < old_item_time:
                old_item_time = time_list[0]
            if time_list[-1] > new_item_time:
                new_item_time = time_list[-1]
                last_modified_time = time.time()
    else:
        if len(time_list) > 0:
            old_item_time = time_list[0]
            new_item_time = time_list[-1]
            last_modified_time = time.time()
        else:
            old_item_time = time.time()
            new_item_time = time.time()
            last_modified_time = time.time()
    return old_item_time, new_item_time, int(last_modified_time)
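
# A sketch of cal_item_time()'s bookkeeping on plain numbers, without
# MongoDB (hypothetical helper, not called anywhere): for a group with
# crawl times 100, 200 and 300, the window is (100, 300) and
# last_modified_time is the wall-clock time of the call.
def _example_time_window(ctimes=(100, 200, 300)):
    ctimes = sorted(ctimes)
    return ctimes[0], ctimes[-1], int(time.time())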
def save_worddict(str_in_list):
    '''
    Structure: string length -> {"string": id}
    Lookup:    find({"length": len(str_in_list)})["string"]
    The in-memory version is implemented first.
    '''
    word_seq_list = []
    # MongoDB-backed version:
    # if not str_in_list:
    #     pass
    # else:
    #     for word in str_in_list:
    #         tmp_id = 0
    #         arg_find = str(len(word)) + "." + word
    #         word_find = [x for x in word_dict.find({}, {arg_find: 1})]
    #         print word_find
    #         tmp_id = word_find[word]
    #         if word_find.hasattr(word):
    #             tmp_id = word_find[word]
    #         else:
    #             tmp_id = len(word_dict[len(word)]) + 1
    #             word_dict.insert({str(len(word)), {word: tmp_id}})
    #         word_seq_list.append([len(word), tmp_id])
    global word_dict
    # Length-bucketed version (length + list):
    # if not str_in_list:
    #     pass
    # else:
    #     for word in str_in_list:
    #         tmp_id = 0
    #         length = str(len(word))
    #         if length in word_dict:
    #             if word in word_dict[length]:
    #                 tmp_id = word_dict[length][word]
    #             else:
    #                 word_dict[length] = {}
    #                 length_length = len(word_dict[length])
    #                 tmp_id = word_dict[length][word] = length_length + 1
    #         else:
    #             word_dict[length] = {}
    #             word_dict[length][word] = 1
    #             tmp_id = 1
    #         word_seq_list.append([length, tmp_id])
    global g_count
    if not str_in_list:
        pass
    else:
        tmp_id = 0
        for word in str_in_list:
            if word in word_dict:
                tmp_id = word_dict[word]
            else:
                tmp_id = word_dict[word] = g_count
                g_count += 1
            word_seq_list.append(tmp_id)
    return word_seq_list
def get_near_dups(text_feature, sim_tmp):
    # Return the keys of every existing group whose feature set is within
    # `threshold` Jaccard similarity of the given feature list.
    f_list = []
    if len(sim_tmp) == 0:
        return None
    for key in sim_tmp.keys():
        text_feature_key = text_feature_dict[key]
        jac_dis = jaccard(set(text_feature), set(text_feature_key))
        print "jac_dis", jac_dis
        if jac_dis > threshold:
            f_list.append(key)
    return f_list
def cal_text(key_main, sim_set):
    # Collect the title and cleaned body of the group head and of every
    # similar item, as a list of {"title": ..., "content": ...} dicts.
    f_list = []
    text_tmp = None
    if len(sim_set) != 0:
        text = [x for x in fin_items.find({"_id": ObjectId(key_main)}, {"title": 1, "content": 1})][0]
        text_tmp = {"title": text["title"], "content": getwords(strip_tags(text["content"]))}
        f_list.append(text_tmp)
        for key_id in sim_set:
            print key_id
            text = [x for x in fin_items.find({"_id": ObjectId(key_id)}, {"title": 1, "content": 1})][0]
            text_tmp = {"title": text["title"], "content": getwords(strip_tags(text["content"]))}
            f_list.append(text_tmp)
    return f_list
def test_process():
    '''
    1. Map each text to a list of feature ids.
    2. Compute Jaccard distances between the feature lists.
    3. Cluster the items by those distances.
    '''
    global sim_tmp, text_group, text_feature_dict
    for item in fin_items.find().limit(100):
        item_id = str(item["_id"])
        title = item["title"]
        body = getwords(strip_tags(item["content"]))
        bigram = ngrams(body, 2)
        long_sentences = cut_sentence(body)
        # Features: the title, the body bigrams and the long sentences.
        l_text_feature_text = sum([[title], bigram, long_sentences], [])
        l_text_feature_id = save_worddict(l_text_feature_text)
        text_feature_dict[item_id] = l_text_feature_id
    for item_id, text_feature in text_feature_dict.items():
        near_list = get_near_dups(text_feature, sim_tmp)
        if near_list:
            # Attach this item to every group it is close to.
            for key_id in near_list:
                sim_tmp[key_id].add(item_id)
        else:
            sim_tmp[item_id] = set()
    text_group.remove()
    for key_id, sim_set in sim_tmp.items():
        old_item_time, new_item_time, last_modified_time = cal_item_time(sim_set)
        text_list = cal_text(key_id, sim_set)
        text_group.insert({"key_id": key_id,
                           "sim_items": list(sim_set),
                           "sim_text": text_list,
                           "center": "",
                           "last_modified_time": last_modified_time,
                           "old_item_time": old_item_time,
                           "new_item_time": new_item_time})
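
# Shape of each document inserted into text_group above (field values are
# illustrative, taken from the insert call):
# {
#     "key_id": "54a1...",            # id of the group's representative item
#     "sim_items": ["54a2...", ...],  # ids of its near-duplicate items
#     "sim_text": [{"title": ..., "content": ...}, ...],
#     "center": "",                   # placeholder for a future centroid
#     "old_item_time": 1419000000,    # earliest crawl time in the group
#     "new_item_time": 1419100000,    # latest crawl time in the group
#     "last_modified_time": 1419100500
# }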
    # fin_items.insert(items.find({"_id": ObjectId(key_id)}))

# Scratch tests kept from development:
# text =
# getwords(text)
# title1 = "房祖名被正式起诉 社会影响大无缓刑可能"
# title2 = "房祖名被正式起诉 社会影响大无缓刑可能"
# body1 = "新浪娱乐讯 据台湾媒体报道,房祖名[微博](本名陈祖明)因“容留他人吸毒罪”,22日遭北京市东城区人民检察院提起公诉。内地资深刑法律师指称,他是公众人物,社会观感影响大,因此不仅没有“缓刑”机会,起诉后也不会让他交保,可能重判2年以上有期徒刑,最快农历年前就锒铛入狱。房祖名经纪人Steven昨说,他也是看到媒体报道才知道,他还没和律师、以及成龙[微博]夫妻联络上。今年8月房祖名、柯震东[微博]吸毒被捕,在两岸三地引发轩然大波;其中,房祖名因涉嫌容留他人吸毒犯罪,自8月14日起被警方关押,9月17日被检方刑事拘留至今。历经近4个月调查,北京东城区检察院正式以“容留他人吸毒罪”,向法院提起公诉。"
# body2 = "新浪娱乐讯 据台湾媒体报道,房祖名[微博](本名陈祖明)因“容留他人吸毒罪”,22日遭北京市东城区人民检察院提起公诉。内地资深刑法律师指称,他是公众人物,社会观感影响大,因此不仅没有“缓刑”机会,起诉后也不会让他交保,可能重判2年以上有期徒刑,最快农历年前就锒铛入狱。房祖名经纪人Steven昨说,他也是看到媒体报道才知道,他还没和律师、以及成龙[微博]夫妻联络上。 今年8月房祖名、柯震东[微博]吸毒被捕,在两岸三地引发轩然大波;其中,房祖名因涉嫌容留他人吸毒犯罪,自8月14日起被警方关押,9月17日被检方刑事拘留至今。历经近4个月调查,北京东城区检察院正式以“容留他人吸毒罪”,向法院提起公诉。"
# 1. Tokenise into bigrams; returns a list.
# bi_list_1 = ngrams(title1, 2)
# bi_list_2 = ngrams(title2, 2)
# 2. Extract the long sentences; returns a list.
# long_sents_1 = cut_sentence(body1)
# long_sents_2 = cut_sentence(body2)
# words1 = sum([bi_list_1, long_sents_1], [])
# words2 = sum([bi_list_1, long_sents_1], [])
# t1 = ["你好", "好么", "好啊"]
# t2 = ["你好"]
# print cosine_distance(cosine_words(t1, t2))
# test = "nihaoa,zuotiannidouganleshenme。nizuijinhaome。"
# a = [0, 1, 0, 0, 0, 1]
# b = [1, 1, 1, 1, 1, 1]
# print cosine_distance(a, b)
# print cut_sentence(test)
# print _line2characters(a)
# print len("你好啊") / 3
# for i in xrange(a):
#     print i
# print ngrams(a, 2)
if __name__ == "__main__":
    test_process()