Created
January 30, 2018 10:57
-
-
Save SylvanasSun/413397766e04b2af6d0ef13ae03ac292 to your computer and use it in GitHub Desktop.
In computer science, SimHash is a technique for quickly estimating how similar two sets are. The algorithm is used by the Google Crawler to find near duplicate pages. It was created by Moses Charikar.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Created by SylvanasSun in 2017.10.17 | |
# !/usr/bin/python | |
# -*- coding: utf-8 -*- | |
import collections | |
import jieba | |
from jieba import analyse | |
# TODO: Change default hash algorithms to the other algorithms of high-performance. | |
def _default_hashfunc(content, hashbits): | |
""" | |
Default hash function is variable-length version of Python's builtin hash. | |
:param content: data that needs to hash. | |
:return: return a decimal number. | |
""" | |
if content == "": | |
return 0 | |
x = ord(content[0]) << 7 | |
m = 1000003 | |
mask = 2 ** hashbits - 1 | |
for c in content: | |
x = ((x * m) ^ ord(c)) & mask | |
x ^= len(content) | |
if x == -1: | |
x = -2 | |
return x | |
# TODO: Change default toknizer to the c/c++ version or other tokenizer of high-performance. | |
def _default_tokenizer_func(content, keyword_weight_pair): | |
""" | |
Default tokenizer function that uses jieba tokenizer. | |
:param keyword_weight_pair: maximum pair number of the keyword-weight list. | |
:return: return keyword-weight list. Example: [('Example',0.4511233019962264),('Hello',0.25548051420382073),...]. | |
""" | |
seg_list = jieba.lcut_for_search(content) | |
# Extract keyword-weight list by TF-IDF algorithms and by sorted maximum weight | |
return jieba.analyse.extract_tags("".join(seg_list), topK=keyword_weight_pair, withWeight=True) | |
class Simhash(object): | |
""" | |
Class Simhash implements simhash algorithms of the Google for filter duplicate content. | |
Simhash algorithms idea is will reduce the dimension of content and compares the | |
difference of the "Hamming Distance" implements filter duplicate content. | |
About simhash algorithms the more introduction: https://en.wikipedia.org/wiki/SimHash | |
Simhash default tokenizer is jieba (https://github.com/fxsjy/jieba). | |
""" | |
def __init__(self, data, keyword_weight_pair=20, hash_bit_number=64, hashfunc=None, tokenizer_func=None): | |
""" | |
:param data: data that needs to be encode. | |
:param keyword_weight_pair: maximum pair number of the keyword-weight list. | |
:param hash_bit_number: maximum bit number for hashcode. | |
:param hashfunc: hash function,its first parameter must be data that needs to be encode | |
and the second parameter must be hash bit number. | |
:param tokenizer_func: tokenizer function,its first parameter must be content that | |
needs to be tokenizer and the second parameter must be | |
keyword_weight_pair. | |
""" | |
if hashfunc is None: | |
self.hashfunc = _default_hashfunc | |
else: | |
self.hashfunc = hashfunc | |
if tokenizer_func is None: | |
self.tokenizer_func = _default_tokenizer_func | |
else: | |
self.tokenizer_func = tokenizer_func | |
self.hash_bit_number = hash_bit_number | |
self.keyword_weight_pari = keyword_weight_pair | |
if isinstance(data, Simhash): | |
self.hash = data.hash | |
elif isinstance(data, int): | |
self.hash = data | |
else: | |
self.simhash(data) | |
def __str__(self): | |
return str(self.hash) | |
def simhash(self, content): | |
""" | |
Select policies for simhash on the different types of content. | |
""" | |
if content is None: | |
self.hash = -1 | |
return | |
if isinstance(content, str): | |
features = self.tokenizer_func(content, self.keyword_weight_pari) | |
self.hash = self.build_from_features(features) | |
elif isinstance(content, collections.Iterable): | |
self.hash = self.build_from_features(content) | |
elif isinstance(content, int): | |
self.hash = content | |
else: | |
raise Exception("Unsupported parameter type %s" % type(content)) | |
def build_from_features(self, features): | |
""" | |
:param features: a list of (token,weight) tuples or a token -> weight dict, | |
if is a string so it need compute weight (a weight of 1 will be assumed). | |
:return: a decimal digit for the accumulative result of each after handled features-weight pair. | |
""" | |
v = [0] * self.hash_bit_number | |
if isinstance(features, dict): | |
features = features.items() | |
# Starting longitudinal accumulation of bits, current bit add current weight | |
# when the current bits equal 1 and else current bit minus the current weight. | |
for f in features: | |
if isinstance(f, str): | |
h = self.hashfunc(f, self.hash_bit_number) | |
w = 1 | |
else: | |
assert isinstance(f, collections.Iterable) | |
h = self.hashfunc(f[0], self.hash_bit_number) | |
w = f[1] | |
for i in range(self.hash_bit_number): | |
bitmask = 1 << i | |
v[i] += w if h & bitmask else -w | |
# Just record weight of the non-negative | |
fingerprint = 0 | |
for i in range(self.hash_bit_number): | |
if v[i] >= 0: | |
fingerprint += 1 << i | |
return fingerprint | |
def is_equal(self, another, limit=0.8): | |
""" | |
Determine two simhash are similar or not similar. | |
:param another: another simhash. | |
:param limit: a limit of the similarity. | |
:return: if similarity greater than limit return true and else return false. | |
""" | |
if another is None: | |
raise Exception("Parameter another is null") | |
if isinstance(another, int): | |
distance = self.hamming_distance(another) | |
elif isinstance(another, Simhash): | |
assert self.hash_bit_number == another.hash_bit_number | |
distance = self.hamming_distance(another.hash) | |
else: | |
raise Exception("Unsupported parameter type %s" % type(another)) | |
similarity = float(self.hash_bit_number - distance) / self.hash_bit_number | |
if similarity > limit: | |
return True | |
return False | |
def hamming_distance(self, another): | |
""" | |
Compute hamming distance,hamming distance is a total number of different bits of two binary numbers. | |
:param another: another simhash value. | |
:return: a hamming distance that current simhash and another simhash. | |
""" | |
x = (self.hash ^ another) & ((1 << self.hash_bit_number) - 1) | |
result = 0 | |
while x: | |
result += 1 | |
x &= x - 1 | |
return result | |
if __name__ == "__main__": | |
sentence_A = """ | |
明朝军制建立在军户制度上,军户即为中国古代世代从军、充当军差的人户。 | |
东晋南北朝时,士兵及家属的户籍隶于军府称为军户。军户子弟世袭为兵未经准许不得脱离军籍。 | |
北魏军户亦有用俘虏充当的。元朝实行军户制度,军户必须出成年男子到军队服役,父死子替,兄亡弟代,世代相袭。 | |
""" | |
sentence_B = """ | |
明朝的军制是在元朝基础上改进,而没有采用唐宋时期的募兵制。 | |
元朝的军制是建立在游牧民族制度上发展而来,游牧民族在战争是全民征兵,实际上是军户制度。 | |
建立元朝以后,蒙古族还是全部军户,对于占领区招降的军队,也实行军户制度。 | |
""" | |
sentence_C = "You know nothing Jon Snow!" | |
sentence_D = "Jon Snow: I konw nothing." | |
simhash_A = Simhash(sentence_A) | |
simhash_B = Simhash(sentence_B) | |
simhash_C = Simhash(sentence_C) | |
simhash_D = Simhash(sentence_D) | |
print(simhash_A) | |
print(simhash_B) | |
print(simhash_C) | |
print(simhash_D) | |
assert simhash_A.is_equal(simhash_B) is True | |
assert simhash_B.is_equal(simhash_C) is False | |
assert simhash_C.is_equal(simhash_D) is True |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment