Created
September 7, 2018 09:10
-
-
Save myvyang/b142252a72901d0fb8d99fe9bae01aba to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
from __future__ import division | |
# Placeholder that Parser.formatDigital substitutes for each run of digits.
NUMBER_TAG = "{NUM}"
class Node(object):
    """A single element in the parsed tree.

    Attributes are plain instance fields so the parser can fill them in
    directly:

    * ``name``       -- tag name given to the constructor.
    * ``attributes`` -- dict of attribute name -> value, initially empty.
    * ``children``   -- set of child nodes, initially empty.
    * ``parent``     -- owning node, ``None`` until the node is attached.
    """

    def __init__(self, name):
        self.name = name
        self.attributes = {}
        self.children = set()
        self.parent = None
class Status:
    """Integer constants naming the scanner states of Parser.parse."""

    Data = 0       # between tags, looking for the next "<"
    RawText = 1    # inside a <script> element
    Tag = 2        # reading a start-tag name
    EndTag = 3     # reading an end-tag name
    AttrName = 4   # inside a start tag, reading attribute names
    AttrValue = 5  # reading the value that follows "="
    Bad = 6        # malformed input was encountered
class Parser(object):
    """Lenient, whitelist-driven HTML scanner used to fingerprint page structure.

    ``parse`` builds a tree of ``Node`` objects for whitelisted tags only;
    ``toSeq`` flattens such a tree into a space-separated token string and
    ``toStr`` prints it for inspection.
    """

    def __init__(self):
        # Tags that become nodes; parse() skips every other tag.
        # NOTE(review): "di" is not an HTML element and looks like a typo for
        # "dl" -- confirm before changing, since it alters the output.
        self.whiteTags = set([
            "body", "head", "html", "button", "dd", "div", "di", "dt",
            "form", "iframe", "input", "label", "li", "style",
            "table", "tbody", "textarea", "title", "area", "tr", "td",
        ])
        # Attributes whose values are emitted by toStr() / toSeq().
        self.whiteAttrs = set(["class", "id"])
        # Tags that never become the parent of the nodes that follow them.
        self.selfCloseTags = set(["button", "input"])
        # Tags whose own name is emitted by toSeq().
        self.specialTags = set([
            "button", "form", "input", "table", "textarea", "iframe",
        ])

    @staticmethod
    def formatDigital(s):
        """Replace every run of ASCII digits in ``s`` with NUMBER_TAG.

        e.g. ``aaaa12nkkm45 -> aaaa{NUM}nkkm{NUM}``

        ``None`` and the empty string both yield ``""``.
        """
        if not s:
            return ""
        bucket = []
        in_number = False
        for ch in s:
            if '0' <= ch <= '9':
                # Emit the placeholder once, at the first digit of a run.
                if not in_number:
                    bucket.append(NUMBER_TAG)
                in_number = True
            else:
                bucket.append(ch)
                in_number = False
        return "".join(bucket)

    @staticmethod
    def isTagChar(c):
        """Return True when ``c`` is an ASCII letter (the only tag-name chars accepted)."""
        return ('a' <= c <= 'z') or ('A' <= c <= 'Z')

    def parse(self, input):
        """Scan HTML text and return the root of the whitelisted-tag tree.

        The returned root is a synthetic ``Node`` named ``"dom"``.  Tags not
        in ``self.whiteTags`` are skipped (their children attach to the
        nearest whitelisted ancestor).  Each ``<script>...</script>`` element
        becomes an attribute-less ``script`` node and its body is not
        scanned.  Attribute values are passed through ``formatDigital``;
        quoted and unquoted values are both accepted.

        :param input: HTML source as a string.
        :return: the root ``Node``.
        """
        # Padding keeps the one-character look-aheads after "<" in range.
        text = input + " " * 20
        length = len(text)
        whitespace = " \t\r\n\f"

        root = Node("dom")
        # Innermost open element; new nodes are attached to it.
        parentNode = root
        # Node whose start tag is currently being read.
        currentNode = None
        # Name of the attribute whose value is being read.
        attrName = None
        # Start offset of the token being read.
        start = 0
        status = Status.Data
        i = 0
        while i < length:
            if status == Status.Data:
                index = text.find("<", i)
                if index < 0:
                    # No markup left; stop instead of re-scanning the tail
                    # once per remaining character.
                    break
                i = index + 1
                if text[i] == "/":
                    # </xxx
                    #   ^
                    i += 1
                    status = Status.EndTag
                else:
                    # <xxx
                    #  ^
                    status = Status.Tag
                start = i
            elif status == Status.Tag:
                while Parser.isTagChar(text[i]):
                    i += 1
                name = text[start:i]
                if not name:
                    # "<" not followed by a letter (comment, doctype, "< ").
                    # i is left alone so that a "<" right here is still seen.
                    status = Status.Data
                elif name == "script":
                    status = Status.RawText
                elif name not in self.whiteTags:
                    status = Status.Data
                else:
                    currentNode = Node(name)
                    currentNode.parent = parentNode
                    parentNode.children.add(currentNode)
                    # <xxx xxx
                    #     ^
                    status = Status.AttrName
                    start = i
            elif status == Status.EndTag:
                index = text.find(">", i)
                if index > -1:
                    name = text[start:index].strip()
                    # Search the chain of open elements, innermost first.
                    # The synthetic root is never a match, so parentNode can
                    # never become None.
                    node = parentNode
                    while node.parent is not None and node.name != name:
                        node = node.parent
                    if node.parent is not None:
                        parentNode = node.parent
                    # Otherwise the end tag closes nothing we opened (e.g. a
                    # non-whitelisted element) and is ignored.
                status = Status.Data
            elif status == Status.AttrName:
                ch = text[i]
                if ch == ">":
                    status = Status.Data
                    if currentNode.name not in self.selfCloseTags:
                        parentNode = currentNode
                elif ch == "/":
                    # The "/" of "<input />" is not part of an attribute name.
                    i += 1
                    start = i
                elif ch == "=":
                    # Only the last word before "=" is the name; earlier
                    # words are valueless attributes such as "disabled".
                    words = text[start:i].split()
                    attrName = words[-1] if words else ""
                    currentNode.attributes[attrName] = None
                    status = Status.AttrValue
                    i += 1
                else:
                    i += 1
            elif status == Status.AttrValue:
                ch = text[i]
                if ch in whitespace:
                    # Whitespace between "=" and the value.
                    i += 1
                elif ch == ">":
                    # "name=" with no value: keep None, let AttrName close the tag.
                    status = Status.AttrName
                    start = i
                elif ch == '"' or ch == "'":
                    index = text.find(ch, i + 1)
                    if index < 0:
                        # Unterminated quote: give up on this tag.
                        status = Status.Data
                    else:
                        value = text[i + 1:index].strip()
                        currentNode.attributes[attrName] = Parser.formatDigital(value)
                        i = index + 1
                        status = Status.AttrName
                        start = i
                else:
                    # Unquoted value: runs until whitespace or ">".
                    index = i
                    while (index < length and text[index] not in whitespace
                           and text[index] != ">"):
                        index += 1
                    value = text[i:index]
                    currentNode.attributes[attrName] = Parser.formatDigital(value)
                    i = index
                    status = Status.AttrName
                    start = i
            elif status == Status.RawText:
                index = text.find("</script", i)
                if index > -1:
                    node = Node("script")
                    node.parent = parentNode
                    parentNode.children.add(node)
                    # Resume after the end tag so the script body is skipped.
                    i = index + len("</script>")
                # With no end tag, fall back to ordinary scanning from here.
                status = Status.Data
        return root

    def dataToSeq(self, input):
        """Parse ``input`` and return its token sequence (see ``toSeq``)."""
        return self.toSeq(self.parse(input))

    def dataToStr(self, input):
        """Parse ``input`` and print the resulting tree (see ``toStr``)."""
        self.toStr(self.parse(input), 0)

    def toStr(self, node, level):
        """Print ``node`` and its descendants, indented one space per level.

        For each node this prints ``<name`` followed by one line per
        whitelisted attribute value (spaces replaced by ``-``).
        """
        indent = " " * level
        print("%s<%s" % (indent, node.name))
        for name in sorted(node.attributes):
            if name not in self.whiteAttrs:
                continue
            value = node.attributes[name]
            if value is not None:
                print("%s%s" % (indent, value.replace(" ", "-")))
        for child in node.children:
            self.toStr(child, level + 1)

    def toSeq(self, node):
        """Flatten ``node`` and its descendants into a space-separated string.

        A node contributes its name when it is in ``self.specialTags``, then
        its non-empty whitelisted attribute values (spaces replaced by
        ``-``), then the distinct sequences of its children.  Attribute
        names and child sequences are sorted so the result is deterministic.
        """
        seq = []
        if node.name in self.specialTags:
            seq.append(node.name)
        for name in sorted(node.attributes):
            if name not in self.whiteAttrs:
                continue
            value = node.attributes[name]
            # None (no value) and "" would only add empty tokens.
            if value:
                seq.append(value.replace(" ", "-"))
        childSeqs = set()
        for child in node.children:
            childSeq = self.toSeq(child)
            if childSeq:
                childSeqs.add(childSeq)
        seq.extend(sorted(childSeqs))
        return " ".join(seq)
def minhash(set1, set2):
    """Return the Jaccard similarity of two sets: |intersection| / |union|.

    Despite the name, this is the exact set ratio, not a MinHash estimate.

    :param set1: first set of hashable items.
    :param set2: second set of hashable items.
    :return: a float in [0.0, 1.0]; two empty sets are treated as identical
        and yield 1.0 instead of dividing by zero.
    """
    union = set1 | set2
    if not union:
        return 1.0
    # float() keeps this a true division even without the __future__ import.
    return float(len(set1 & set2)) / len(union)
def test():
    """Fetch three live pages and print their pairwise token-set similarity.

    Requires network access and the third-party ``requests`` package.
    """
    import requests

    parser = Parser()
    urls = [
        "https://www.taobao.com/",
        "https://detail.1688.com/offer/574230014567.html",
        "https://detail.1688.com/offer/574100377565.html",
    ]
    pages = [requests.get(url).text for url in urls]
    # One token set per page; minhash() compares these sets.
    s1, s2, s3 = [set(parser.dataToSeq(page).split(" ")) for page in pages]
    # The "1-3" line previously compared pages 1 and 2 by mistake.
    print("1-3 %.6f" % minhash(s1, s3))
    print("1-2 %.6f" % minhash(s1, s2))
    print("2-3 %.6f" % minhash(s2, s3))
# Run the network-backed demo only when this file is executed as a script.
if __name__ == "__main__":
    test()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment