#-*- coding:utf-8 -*-
from itertools import izip
from math import sqrt
from itertools import islice
from HTMLParser import HTMLParser
import MongoDBConn
from bson import ObjectId
dbconn=MongoDBConn.DBConn()
dbconn.connect()
conn=dbconn.getConn()
fin_items=conn.scrapy.fin_items
# conn.scrapy.createCollection  # no-op attribute access; pymongo creates collections lazily on first insert
fin_2_items=conn.scrapy.fin_2_items
text_group=conn.scrapy.text_group
# word_dict=conn.scrapy.word_dict
word_dict={}
url_footprint={}
tmp_items={}
final_items={}
sim_tmp={}
threshold=0.3
g_count=0
import time
# fin_items.remove()
text_feature_dict={}
def jaccard_list(a, *arg):
    f_list = []
    for b in arg:
        c = a.intersection(b)
        f_list.append(float(len(c)) / (len(a) + len(b) - len(c)))
    return f_list
def jaccard(a, b):
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))
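# Illustrative sketch (hypothetical _demo_* helper, not used by the pipeline):
# the Jaccard similarity above is |A ∩ B| / |A ∪ B|.
def _demo_jaccard():
    a = set(["beijing", "shanghai", "guangzhou"])
    b = set(["beijing", "shanghai", "shenzhen"])
    print jaccard(a, b)  # 2 / (3 + 3 - 2) = 0.5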
def _line2characters(line):
    return list(line.decode("utf-8"))
def getwords(words):
    '''
    Drop the short paragraphs: keep only paragraphs longer than 100
    characters and concatenate them into one string.
    '''
    # print text.encode("utf-8")
    # print words
    text = _line2characters(words)  # result is unused
    fin_word_list = []
    thres = 20
    tmp_t = thres
    fin_string = ""
    for para in sorted(words.split("\n"), key=len):
        if len(para) > 100:
            # fin_word_list.append(para)
            # print para, len(para)
            fin_string += para
    return fin_string
def _line2characters(line):
    # NOTE: this redefinition shadows the decode-based version defined above.
    return list(line.encode("utf-8"))
def ngrams(words, n):
    z = (islice(words, i, None) for i in range(n))
    return zip(*z)
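# Illustrative sketch (hypothetical helper): ngrams() zips n staggered slices of
# the input, so the character bigrams of "abcd" come out as overlapping pairs.
def _demo_ngrams():
    print ngrams("abcd", 2)  # [('a', 'b'), ('b', 'c'), ('c', 'd')]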
def cosine_distance(a):
    b = [1] * len(a)
    assert len(a) == len(b)
    ab_sum, a_sum, b_sum = 0, 0, 0
    for ai, bi in izip(a, b):
        ab_sum += ai * bi
        a_sum += ai * ai
        b_sum += bi * bi
    return 1 - ab_sum / sqrt(a_sum * b_sum)
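# Worked example (hypothetical helper): because the reference vector b is all
# ones, for a 0/1 match vector a this reduces to 1 - sum(a) / sqrt(sum(a) * len(a)),
# e.g. [1, 1, 0, 0] -> 1 - 2 / sqrt(8) ≈ 0.293.
def _demo_cosine_distance():
    print cosine_distance([1, 1, 0, 0])  # ~0.293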
def cut_sentence(words):
    # print words
    words = words.encode('utf8')
    start = 0
    i = 0
    sents = []
    punt_list = ',.!?:;~,。!?:;~ '
    token = ""
    for word in words:
        if word in punt_list and token not in punt_list:
            sents.append(words[start:i+1])
            start = i + 1
            i += 1
        else:
            i += 1
            token = list(words[start:i+2]).pop()  # look ahead to the next character
    if start < len(words):
        sents.append(words[start:])
    f_sents = []
    for sent in sents:
        if len(sent)/3 >= 8:  # keep only "long" sentences (>= 24 UTF-8 bytes, about 8 CJK characters)
            f_sents.append(sent)
    return f_sents
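# Illustrative sketch (hypothetical helper): cut_sentence() splits on the bytes in
# punt_list (note a plain space is one of them, hence the space-free sample text)
# and keeps only pieces of at least 24 UTF-8 bytes (len(sent)/3 >= 8).
def _demo_cut_sentence():
    text = u"Ashortpiece.Thissecondpiececontainswelloverthetwentyfourbytesneededtosurvive."
    print cut_sentence(text)  # only the long second piece passes the length filter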
def cosine_words(word1, word2):
    '''
    Pad the shorter token list with spaces, then return an element-wise
    0/1 match vector between the two lists.
    '''
    long_word = len(word1) > len(word2) and word1 or word2
    short_word = len(word1) < len(word2) and word1 or word2
    # print long_word
    # print short_word
    num = len(long_word) - len(short_word)
    for i in xrange(num):
        short_word.append(" ")
    # print short_word
    assert len(word1) == len(word2)
    fin = []
    for i in xrange(len(word1)):
        if word1[i] == word2[i]:
            fin.append(1)
        else:
            fin.append(0)
    return fin
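# Illustrative sketch (hypothetical helper): cosine_words() turns two token lists
# into the 0/1 agreement vector that cosine_distance() then scores.
def _demo_cosine_words():
    print cosine_words(list("abc"), list("abd"))  # [1, 1, 0]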
def final_distance(list1, list2):
    title1 = list1[0]
    title2 = list2[0]
    body1 = ngrams(list1[1], 2)
    body2 = ngrams(list2[1], 2)
    long_sents_1 = cut_sentence(list1[1])
    long_sents_2 = cut_sentence(list2[1])
    # titles as character lists so cosine_words can pad the shorter one in place
    t_float = cosine_distance(cosine_words(list(title1), list(title2)))
    b_float = cosine_distance(cosine_words(body1, body2))
    s_float = cosine_distance(cosine_words(long_sents_1, long_sents_2))
    # weighted combination: long sentences 4, body bigrams 2, title 1
    return float((4*s_float + b_float*2 + 1*t_float) / 7)
class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return ''.join(self.fed)
def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()
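# Illustrative sketch (hypothetical helper): strip_tags() keeps only the text
# nodes MLStripper collects in handle_data and discards the markup.
def _demo_strip_tags():
    print strip_tags("<p>hello <b>world</b></p>")  # hello world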
def main():
    global threshold, fin_items, fin_2_items
    '''
    1. Bigram tokenization, returns a list.
    2. Take the long sentences, returns a list.
    3. Combine them and compute the cosine value for every pair.
    '''
    '''
    Scan the collection.
    Check whether the item is already in the set:
        if it is, pass;
        if it is not,
            add the point to fin_set,
            compute the ones whose score is greater than 0.8 and add them to the set.
    Obtain fin_set,
    then insert.
    '''
    print "start"
    print fin_items.find().count()
    for item in fin_items.find():
        print "test"
        _id = item["_id"]
        title = item["title"]
        body = getwords(strip_tags(item["content"]))
        tmp_items[_id] = [title, body]
    sim_set = set()
    fin_set = set()
    for k_p, v_p in tmp_items.items():
        for k, v in tmp_items.items():
            if k in sim_set:
                print k, " in sim_set"
            else:
                d = final_distance(v, v_p)
                if d > threshold:
                    sim_set.add(k)
                    fin_set.add(k_p)
    fin_sim_set = sim_set - fin_set
    fin_2_items.remove()
    for item in fin_items.find():
        if item["_id"] in fin_sim_set:
            print " item ", item['_id'], item['title'], " is in fin_sim_set"
        else:
            fin_2_items.insert(item)
# Following the same idea, build centroids here.
def similiarity_comp(id, arg_list):
    '''
    input: item
    NOTE: this helper assumes jieba and a simhash index (e.g. Simhash /
    SimhashIndex from the simhash package) plus a module-level `index`,
    none of which are imported or created in this gist.
    '''
    global sim_tmp, sim_group, index
    # TODO: add initialization of `index`
    title_simhash = Simhash(jieba.cut(arg_list[0]))
    if index.count == 0:
        index.add(id, title_simhash)
        sim_tmp[id] = set()
    else:
        near_list = index.get_near_dups(title_simhash)
        if near_list:
            # If near-duplicates are found, add this point to each of their sets.
            for key_id in near_list:
                sim_tmp[key_id].add(id)
        else:
            # If none are found, create an entry in sim_tmp and update the index.
            sim_tmp[id] = set()
            index.add(id, title_simhash)
def cal_item_time(sim_set, **xargs):
    # for i in xargs:
    old_item_time, new_item_time, last_modified_time = None, None, 0
    tmp_items = {}
    time_list = []
    for key_id in sim_set:
        tmp_item = fin_items.find({"_id": ObjectId(key_id)}, {"_spider.ctime": 1})[0]
        tmp_items[str(tmp_item["_id"])] = int(tmp_item["_spider"]["ctime"])
    for k, v in tmp_items.items():
        time_list.append(v)
    time_list = sorted(time_list)
    if xargs:
        print "xargs"
        old_item_time, new_item_time, last_modified_time = [v for k, v in xargs.items()]
        if old_item_time < time_list[0]:
            old_item_time = time_list[0]
        if new_item_time > time_list[-1]:
            new_item_time = time_list[-1]
        last_modified_time = time.time()
    else:
        # print len(time_list),"len(time_list)"
        if len(time_list) > 0:
            old_item_time = time_list[0]
            new_item_time = time_list[-1]
            last_modified_time = time.time()
        else:
            old_item_time = time.time()
            new_item_time = time.time()
            last_modified_time = time.time()
    return old_item_time, new_item_time, int(last_modified_time)
def save_worddict(str_in_list):
    '''
    Structure: string length -> {"string": id}
    Lookup: find({"length": len(str_in_list)})["string"]
    In-memory version implemented first.
    '''
    # global f  # unused
    word_seq_list = []
# if not str_in_list:
# pass
# else:
# for word in str_in_list:
# tmp_id=0
# arg_find=str(len(word))+"."+word
# word_find=[x for x in word_dict.find({},{arg_find:1})]
# print word_find
# tmp_id=word_find[word]
# if word_find.hasattr(word):
# tmp_id=word_find[word]
# else:
# tmp_id=len(word_dict[len(word)])+1
# word_dict.insert({str(len(word)),{word:tmp_id}})
# word_seq_list.append([len(word),tmp_id])
    global word_dict
    # length + list version
# if not str_in_list:
# pass
# else:
# for word in str_in_list:
# tmp_id=0
# length=str(len(word))
# if length in word_dict:
# if word in word_dict[length]:
# tmp_id=word_dict[length][word]
# else:
# word_dict[length]={}
# length_length = len(word_dict[length])
# tmp_id = word_dict[length][word] = length_length+1
# else:
# word_dict[length]={}
# word_dict[length][word]=1
# tmp_id=1
# word_seq_list.append([length,tmp_id])
    global g_count
    if not str_in_list:
        pass
    else:
        tmp_id = 0
        for word in str_in_list:
            if word in word_dict:
                tmp_id = word_dict[word]
            else:
                tmp_id = word_dict[word] = g_count
                g_count += 1
            word_seq_list.append(tmp_id)
    return word_seq_list
# print "haha"
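# Illustrative sketch (hypothetical helper): save_worddict() hands out one global
# integer id per distinct feature and reuses it on repeats, so the same word or
# n-gram always maps to the same id across documents.
def _demo_save_worddict():
    print save_worddict(["foo", "bar", "foo"])  # e.g. [0, 1, 0] on a fresh word_dict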
def get_near_dups(text_feature, sim_tmp):
    f_list = []
    if len(sim_tmp) == 0:
        return None
    for key in sim_tmp.keys():
        text_feature_key = text_feature_dict[key]
        jac_dis = jaccard(set(text_feature), set(text_feature_key))
        print "jac_dis", jac_dis
        if jac_dis > threshold:
            f_list.append(key)
    return f_list
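# Illustrative sketch (hypothetical helper, writes a fake entry into
# text_feature_dict): with threshold = 0.3, any existing cluster key whose id set
# overlaps the candidate's features by more than 0.3 (Jaccard) is reported.
def _demo_get_near_dups():
    text_feature_dict["some_key"] = [1, 2, 3, 4]
    print get_near_dups([1, 2, 3, 9], {"some_key": set()})  # ['some_key'], Jaccard = 3/5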
def cal_text(key_main, sim_set):
    f_list = []
    text_tmp = None
    if len(sim_set) != 0:
        text = [x for x in fin_items.find({"_id": ObjectId(key_main)}, {"title": 1, "content": 1})][0]
        text_tmp = {"title": text["title"], "content": getwords(strip_tags(text["content"]))}
        f_list.append(text_tmp)
        for key_id in sim_set:
            print key_id
            # print [x for x in fin_items.find({"_id":ObjectId(key_id)},{"title":1,"content":1})]
            text = [x for x in fin_items.find({"_id": ObjectId(key_id)}, {"title": 1, "content": 1})][0]
            # print text
            text_tmp = {"title": text["title"], "content": getwords(strip_tags(text["content"]))}
            f_list.append(text_tmp)
    return f_list
def test_process():
    '''
    1. Convert the text features to ids.
    2. Compute the Jaccard distance between them.
    3. Cluster the items based on that.
    '''
    global sim_tmp, text_group, fin_list, text_feature_dict
    # i=1
    for item in fin_items.find().limit(100):
        item_id = str(item["_id"])
        title = item["title"]
        body = getwords(strip_tags(item["content"]))
        # print "body",body
        # print "strip_tags over"
        bigram = ngrams(body, 2)
        long_sentences = cut_sentence(body)
        # print "cut_sentence over"
        # print "title",title
        # print "bigram",bigram
        # print "long_sentences",long_sentences
        # print bigram
        l_text_feature_text = sum([[title], bigram, long_sentences], [])
        l_text_feature_id = save_worddict(l_text_feature_text)
        text_feature_dict[item_id] = l_text_feature_id
    # print "text_feature_dict",text_feature_dict
    for item_id, text_feature in text_feature_dict.items():
        near_list = get_near_dups(text_feature, sim_tmp)
        if near_list:
            for key_id in near_list:
                sim_tmp[key_id].add(item_id)
        else:
            sim_tmp[item_id] = set()
    text_group.remove()
    for key_id, sim_set in sim_tmp.items():
        old_item_time, new_item_time, last_modified_time = cal_item_time(sim_set)
        text_list = cal_text(key_id, sim_set)
        text_group.insert({"key_id": key_id, "sim_items": list(sim_set), "sim_text": text_list, "center": "", "last_modified_time": last_modified_time, "old_item_time": old_item_time, "new_item_time": new_item_time})
        # fin_items.insert(items.find({"_id":ObjectId(key_id)}))
# text =
# (html)*/
# getwords(text)
# title1="房祖名被正式起诉 社会影响大无缓刑可能"
# title2="房祖名被正式起诉 社会影响大无缓刑可能"
# body1="新浪娱乐讯 据台湾媒体报道,房祖名[微博](本名陈祖明)因“容留他人吸毒罪”,22日遭北京市东城区人民检察院提起公诉。内地资深刑法律师指称,他是公众人物,社会观感影响大,因此不仅没有“缓刑”机会,起诉后也不会让他交保,可能重判2年以上有期徒刑,最快农历年前就锒铛入狱。房祖名经纪人Steven昨说,他也是看到媒体报道才知道,他还没和律师、以及成龙[微博]夫妻联络上。今年8月房祖名、柯震东[微博]吸毒被捕,在两岸三地引发轩然大波;其中,房祖名因涉嫌容留他人吸毒犯罪,自8月14日起被警方关押,9月17日被检方刑事拘留至今。历经近4个月调查,北京东城区检察院正式以“容留他人吸毒罪”,向法院提起公诉。"
# body2="新浪娱乐讯 据台湾媒体报道,房祖名[微博](本名陈祖明)因“容留他人吸毒罪”,22日遭北京市东城区人民检察院提起公诉。内地资深刑法律师指称,他是公众人物,社会观感影响大,因此不仅没有“缓刑”机会,起诉后也不会让他交保,可能重判2年以上有期徒刑,最快农历年前就锒铛入狱。房祖名经纪人Steven昨说,他也是看到媒体报道才知道,他还没和律师、以及成龙[微博]夫妻联络上。  今年8月房祖名、柯震东[微博]吸毒被捕,在两岸三地引发轩然大波;其中,房祖名因涉嫌容留他人吸毒犯罪,自8月14日起被警方关押,9月17日被检方刑事拘留至今。历经近4个月调查,北京东城区检察院正式以“容留他人吸毒罪”,向法院提起公诉。"
# # 1. Bigram tokenization, returns a list.
# bi_list_1=ngrams(title1,2)
# bi_list_2=ngrams(title2,2)
# # 2. Take the long sentences, returns a list.
# long_sents_1=cut_sentence_new(body1)
# long_sents_2=cut_sentence_new(body2)
# words1=sum([bi_list_1,long_sents_1],[])
# words2=sum([bi_list_1,long_sents_1],[])
# t1=["你好","好么","好啊"]
# t2=["你好"]
# print cosine_distance(cosine_words(t1,t2))
# test="nihaoa,zuotiannidouganleshenme。nizuijinhaome。"
# a=[0,1,0,0,0,1]
# b=[1,1,1,1,1,1]
# print cosine_distance(a,b)
# print cut_sentence_new(test)
# print _line2characters(a)
# print len("你好啊")/3
# for i in xrangea:
# print i
# print ngrams(a,2)
if __name__ == "__main__":
    test_process()