Skip to content

Instantly share code, notes, and snippets.

@myvyang
Created September 7, 2018 09:10
Show Gist options
  • Save myvyang/b142252a72901d0fb8d99fe9bae01aba to your computer and use it in GitHub Desktop.
Save myvyang/b142252a72901d0fb8d99fe9bae01aba to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
from __future__ import division
# Placeholder that Parser.formatDigital substitutes for every run of digits.
NUMBER_TAG = "{NUM}"
class Node(object):
    """A single element in the tree built by ``Parser.parse``.

    Attributes are plain and public; the parser fills them in directly.
    """

    def __init__(self, name):
        """Create a detached node.

        :param name: tag name of the element (``"dom"`` for the root that
            ``Parser.parse`` creates).
        """
        # Tag name of this element.
        self.name = name
        # Enclosing node, or None while the node is not attached.
        self.parent = None
        # Child nodes; a set, so sibling order is not preserved.
        self.children = set()
        # Attribute name -> value (None when no value was captured).
        self.attributes = {}

    def __repr__(self):
        return "Node(%r)" % (self.name,)
class Status:
    """Integer state codes for the scanner loop in ``Parser.parse``."""

    # Between tags: looking for the next "<".
    Data = 0
    # Inside a <script> element: skipping to the closing "</script".
    RawText = 1
    # Just after "<": reading a start-tag name.
    Tag = 2
    # Just after "</": reading an end-tag name.
    EndTag = 3
    # Inside a start tag: reading attribute names.
    AttrName = 4
    # Just after "=": reading an attribute value.
    AttrValue = 5
    # Recovery state: skip one character and resume in Data.
    Bad = 6
class Parser(object):
    """Tolerant single-pass HTML scanner that builds a coarse element tree.

    Only tags listed in ``whiteTags`` become nodes.  ``<script>`` elements are
    recorded as childless ``script`` nodes and their content is skipped.
    Attribute values are stored with every run of digits replaced by
    ``NUMBER_TAG`` (see ``formatDigital``).
    """

    def __init__(self):
        # Tags that become tree nodes; every other tag is skipped.
        # NOTE(review): "di" is not an HTML element -- possibly a typo for
        # "dl".  Left unchanged because altering the list changes the
        # extracted features; confirm the intent.
        self.whiteTags = {
            "body", "head", "html", "button", "dd", "div", "di", "dt",
            "form", "iframe", "input", "label", "li", "style",
            "table", "tbody", "textarea", "title", "area", "tr", "td",
        }
        # Attributes whose values are emitted by toSeq / toStr.
        self.whiteAttrs = {"class", "id"}
        # Tags treated as leaves: they never become the parent of later nodes.
        self.selfCloseTags = {"button", "input"}
        # Tags whose own name is emitted by toSeq.
        self.specialTags = {"button", "form", "input", "table", "textarea",
                            "iframe"}

    @staticmethod
    def formatDigital(s):
        """Replace every run of ASCII digits in ``s`` with ``NUMBER_TAG``.

        e.g. ``aaaa12nkkm45 -> aaaa{NUM}nkkm{NUM}``

        :param s: string to normalise; ``None`` is accepted.
        :return: the normalised string, or ``""`` for ``None`` / empty input.
        """
        if not s:
            return ""
        pieces = []
        inDigits = False
        for ch in s:
            if '0' <= ch <= '9':
                # Emit one placeholder per run, on its first digit.
                if not inDigits:
                    pieces.append(NUMBER_TAG)
                inDigits = True
            else:
                pieces.append(ch)
                inDigits = False
        return "".join(pieces)

    @staticmethod
    def isTagChar(c):
        """Return True when ``c`` is an ASCII letter (a-z or A-Z)."""
        return ('a' <= c <= 'z') or ('A' <= c <= 'Z')

    def parse(self, input):
        """Scan ``input`` and return the root ``Node`` (named ``"dom"``).

        The scanner is a state machine over ``Status``.  It never raises on
        malformed markup; unparseable constructs are skipped.

        :param input: HTML text.
        :return: root node whose descendants are the white-listed elements
            and ``script`` nodes found in the text.
        """
        # Trailing padding guarantees the one-character look-aheads and the
        # name / unquoted-value scans below always hit a space before the
        # end of the string.
        input += " " * 20
        root = Node("dom")
        # Open element that newly created nodes are attached to.
        parentNode = root
        # Most recently created or closed node.
        currentNode = None
        # Start offset of the token currently being read.
        start = 0
        # Quote character of the attribute value being read, or None.
        quoteType = None
        status = Status.Data
        attrName = None
        length = len(input)
        i = 0
        while i < length:
            if status == Status.Data:
                index = input.find("<", i)
                if index < 0:
                    # No markup left: stop instead of rescanning the tail
                    # once per remaining character.
                    break
                i = index + 1
                if input[i] == '/':
                    # </xxx
                    #   ^
                    i += 1
                    status = Status.EndTag
                else:
                    # <xxx
                    #  ^
                    status = Status.Tag
                start = i
            elif status == Status.Tag:
                while Parser.isTagChar(input[i]):
                    i += 1
                if i == start:
                    # "<" not followed by a letter.  Do not consume the
                    # character: it may itself be the "<" of a real tag.
                    status = Status.Data
                else:
                    # <xxx
                    #     ^
                    name = input[start:i]
                    if name == "script":
                        status = Status.RawText
                    elif name not in self.whiteTags:
                        status = Status.Data
                    else:
                        currentNode = Node(name)
                        currentNode.parent = parentNode
                        parentNode.children.add(currentNode)
                        # <xxx xxx
                        #     ^
                        status = Status.AttrName
                        start = i
            elif status == Status.EndTag:
                if currentNode is not None:
                    index = input.find(">", i)
                    if index > -1:
                        name = input[start:index].strip()
                        # Walk up from the current node looking for the
                        # element this end tag closes.  The synthetic root
                        # is never a match.
                        node = currentNode
                        while node is not root and node.name != name:
                            node = node.parent
                        if node is not root:
                            currentNode = node
                            parentNode = node.parent
                        # An end tag that matches nothing (e.g. a tag that
                        # was never turned into a node) is ignored, so it
                        # cannot close the element that is currently open.
                status = Status.Data
            elif status == Status.AttrName:
                ch = input[i]
                if ch == ">":
                    # <xxx xxx>
                    #         ^
                    status = Status.Data
                    if currentNode.name not in self.selfCloseTags:
                        parentNode = currentNode
                elif ch == "/":
                    # <xxxx xxx /
                    #           ^
                    if input.find(">", i) < 0:
                        status = Status.Bad
                    else:
                        i += 1
                        start = i
                elif ch == "=":
                    # Only the last whitespace-separated token before "=" is
                    # the attribute name; earlier tokens are value-less
                    # attributes such as "disabled".
                    tokens = input[start:i].split()
                    attrName = tokens[-1] if tokens else ""
                    currentNode.attributes[attrName] = None
                    status = Status.AttrValue
                    i += 1
                    # Whitespace is allowed between "=" and the value.
                    while i < length and input[i].isspace():
                        i += 1
                    if i < length and input[i] in ('\'', '"'):
                        quoteType = input[i]
                        i += 1
                    else:
                        quoteType = None
                    start = i
                else:
                    i += 1
            elif status == Status.AttrValue:
                if quoteType is None:
                    # Unquoted value: runs up to whitespace or ">".
                    end = i
                    while input[end] != ">" and not input[end].isspace():
                        end += 1
                    resume = end
                else:
                    end = input.find(quoteType, i)
                    resume = end + 1
                if end < 0:
                    # Unterminated quoted value.
                    status = Status.Bad
                else:
                    attrValue = input[start:end].strip()
                    currentNode.attributes[attrName] = \
                        Parser.formatDigital(attrValue)
                    i = resume
                    start = i
                    status = Status.AttrName
            elif status == Status.RawText:
                index = input.find("</script", i)
                if index > -1:
                    node = Node("script")
                    node.parent = parentNode
                    parentNode.children.add(node)
                    # Jump past the script body so its text is never
                    # scanned as markup.
                    i = index + len("</script")
                    start = i
                    status = Status.Data
                else:
                    status = Status.Bad
            elif status == Status.Bad:
                i += 1
                start = i
                status = Status.Data
        return root

    def dataToSeq(self, input):
        """Parse ``input`` and return its feature string (see ``toSeq``)."""
        return self.toSeq(self.parse(input))

    def dataToStr(self, input):
        """Parse ``input`` and print the resulting tree (see ``toStr``)."""
        self.toStr(self.parse(input), 0)

    def toStr(self, node, level):
        """Print ``node`` and its descendants, indented one space per level.

        For each node the tag name is printed, followed by the values of its
        white-listed attributes with spaces replaced by ``-``.
        """
        indent = " " * level
        print("%s<%s" % (indent, node.name))
        for attr in sorted(node.attributes):
            if attr not in self.whiteAttrs:
                continue
            value = node.attributes[attr]
            if value is not None:
                print("%s%s" % (indent, value.replace(" ", "-")))
        for child in node.children:
            self.toStr(child, level + 1)

    def toSeq(self, node):
        """Return a space-separated feature string for the subtree at ``node``.

        The string holds the tag name (only for ``specialTags``), the values
        of white-listed attributes with spaces replaced by ``-``, and the
        non-empty feature strings of the children.  Identical child strings
        are emitted once.  Attributes and children are emitted in sorted
        order so the result is the same on every run.
        """
        seq = []
        if node.name in self.specialTags:
            seq.append(node.name)
        for attr in sorted(node.attributes):
            if attr not in self.whiteAttrs:
                continue
            value = node.attributes[attr]
            if value is not None:
                seq.append(value.replace(" ", "-"))
        childSeqSet = set()
        for child in node.children:
            childSeq = self.toSeq(child)
            if childSeq:
                childSeqSet.add(childSeq)
        seq.extend(sorted(childSeqSet))
        return " ".join(seq)
def minhash(set1, set2):
    """Return the Jaccard similarity of two sets.

    Despite the name, no hashing is done: the result is
    ``len(set1 & set2) / len(set1 | set2)``.

    :param set1: first set.
    :param set2: second set.
    :return: float in ``[0.0, 1.0]``.  Two empty sets are treated as
        identical and yield ``1.0`` instead of dividing by zero.
    """
    union = set1 | set2
    if not union:
        return 1.0
    mix = set1 & set2
    # Explicit float so the result does not depend on the importing module's
    # division semantics.
    return float(len(mix)) / len(union)
def test():
    """Fetch three live pages and print their pairwise similarity.

    Manual smoke test: needs network access and the third-party
    ``requests`` package, which is imported locally so the module itself
    has no hard dependency on it.
    """
    import requests
    parser = Parser()
    url1 = "https://www.taobao.com/"
    url2 = "https://detail.1688.com/offer/574230014567.html"
    url3 = "https://detail.1688.com/offer/574100377565.html"
    # A timeout keeps an unresponsive host from hanging the script forever.
    timeout = 10
    d1 = requests.get(url1, timeout=timeout).text
    d2 = requests.get(url2, timeout=timeout).text
    d3 = requests.get(url3, timeout=timeout).text
    s1 = set(parser.dataToSeq(d1).split(" "))
    s2 = set(parser.dataToSeq(d2).split(" "))
    s3 = set(parser.dataToSeq(d3).split(" "))
    # The "1-3" line previously compared pages 1 and 2.
    print ("1-3 %.6f" % minhash(s1, s3))
    print ("1-2 %.6f" % minhash(s1, s2))
    print ("2-3 %.6f" % minhash(s2, s3))
# Run the network smoke test only when executed as a script, not on import.
if __name__ == "__main__":
    test()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment