myvyang/page_parser.py

## page_parser.py

# -*- coding: utf-8 -*-
from __future__ import division

# Placeholder that Parser.formatDigital substitutes for every run of ASCII digits.
NUMBER_TAG = "{NUM}"

class Node(object):
    """One element of the tree produced by ``Parser.parse``."""

    def __init__(self, name):
        # tag name of this element
        self.name = name
        # attribute name -> attribute value collected from the start tag
        self.attributes = dict()
        # child nodes; an unordered set, so sibling order is not kept
        self.children = set()
        # enclosing node; None until the node is attached to a parent
        self.parent = None

class Status:
    """States of the scanning loop in ``Parser.parse``."""

    # between tags, looking for the next "<"
    Data = 0
    # inside a <script> element
    RawText = 1
    # reading the name of a start tag
    Tag = 2
    # reading an end tag
    EndTag = 3
    # inside a start tag, reading attribute names
    AttrName = 4
    # reading an attribute value
    AttrValue = 5
    # recovery state: skip one character and resume in Data
    Bad = 6

class Parser(object):
    """Lenient, whitelist-driven HTML scanner used to fingerprint page structure.

    ``parse`` builds a tree of ``Node`` objects for whitelisted tags only.
    ``toSeq`` flattens such a tree into a space-separated token string made
    of special tag names and ``class``/``id`` values; callers split that
    string into a set of tokens and compare the sets.
    """

    def __init__(self):
        # Only these tags become nodes; every other tag is skipped.
        # "dl" is listed because it is the container of the dd/dt entries.
        self.whiteTags = set([
            "body", "head", "html", "button", "dd", "div", "dl", "dt",
            "form", "iframe", "input", "label", "li", "style",
            "table", "tbody", "textarea", "title", "area", "tr", "td"])
        # Only the values of these attributes are emitted by toSeq/toStr.
        self.whiteAttrs = set(["class", "id"])
        # Tags that never become the parent of following nodes.
        # "input" and "area" are void elements; "button" is treated the same.
        self.selfCloseTags = set(["button", "input", "area"])
        # Tags whose own name is emitted as a token by toSeq.
        self.specialTags = set(["button", "form", "input", "table", "textarea", "iframe"])

    @staticmethod
    def formatDigital(s):
        """
            Replace every run of ASCII digits in ``s`` with NUMBER_TAG.
            e.g.
                aaaa12nkkm45 -> aaaa{NUM}nkkm{NUM}

            Returns "" when ``s`` is None or empty.
        """
        if not s:
            return ""

        bucket = []
        inDigits = False
        for ch in s:
            if '0' <= ch <= '9':
                # emit the placeholder once, at the start of each digit run
                if not inDigits:
                    bucket.append(NUMBER_TAG)
                    inDigits = True
            else:
                bucket.append(ch)
                inDigits = False

        return "".join(bucket)

    @staticmethod
    def isTagChar(c):
        """Return True when ``c`` is an ASCII letter (a-z or A-Z)."""
        return ('a' <= c <= 'z') or ('A' <= c <= 'Z')

    def parse(self, input):
        """Scan HTML text and return the root ``Node`` (named "dom").

        Only tags in ``self.whiteTags`` become nodes; ``<script>`` elements
        are recorded as a childless "script" node and their content is
        skipped.  Every attribute of a whitelisted tag is stored on its node,
        with digit runs in the value replaced via ``formatDigital``.
        Tag names are matched case-sensitively.  Malformed markup is
        skipped rather than reported.
        """
        # Pad with spaces so the one-character look-ahead below never
        # indexes past the end of the text.
        input += " " * 20

        root = Node("dom")
        # innermost element that is still open; new nodes are attached here
        parentNode = root
        # node whose start tag is currently being scanned
        currentNode = None

        # start position of the current token (tag / attribute name / value)
        start = 0
        # quote character around the current attribute value, None if unquoted
        quoteType = None
        # current state of the scanner
        status = Status.Data

        attrName = None

        length = len(input)
        i = 0
        while i < length:
            if status == Status.Data:
                index = input.find("<", i)
                if index < 0:
                    # No markup left: stop instead of re-searching the tail
                    # once per remaining character.
                    break
                i = index + 1
                if input[i] == '/':
                    # </xxx
                    #   ^
                    i += 1
                    status = Status.EndTag
                else:
                    # <xxx
                    #  ^
                    status = Status.Tag
                start = i
            elif status == Status.Tag:
                while Parser.isTagChar(input[i]):
                    i += 1

                if i == start:
                    # "<" not followed by a letter (e.g. "< x", "<!--")
                    status = Status.Data
                    i += 1
                else:
                    # <xxx
                    #     ^
                    name = input[start:i]
                    if name == "script":
                        status = Status.RawText
                    elif name not in self.whiteTags:
                        status = Status.Data
                    else:
                        currentNode = Node(name)
                        currentNode.parent = parentNode
                        parentNode.children.add(currentNode)

                        status = Status.AttrName
                        start = i
            elif status == Status.EndTag:
                index = input.find(">", i)
                if index > -1:
                    name = input[start:index].strip()

                    # Close the nearest open element with this name.  The
                    # search starts at the innermost open element and never
                    # matches the synthetic root; an end tag that matches
                    # nothing (non-whitelisted or stray) is ignored.
                    node = parentNode
                    while node is not root and node.name != name:
                        node = node.parent
                    if node is not root:
                        parentNode = node.parent

                # i is left in place; Data resumes at the next "<"
                status = Status.Data
            elif status == Status.AttrName:
                ch = input[i]
                if ch == ">":
                    # <xxx xxx>
                    #         ^
                    status = Status.Data
                    if currentNode.name not in self.selfCloseTags:
                        parentNode = currentNode
                elif ch == "/":
                    # <xxxx xxx /
                    #           ^
                    if input.find(">", i) < 0:
                        # the tag is never terminated: give up on it
                        status = Status.Data
                    i += 1
                    start = i
                elif ch == "=":
                    # Text since `start` may hold earlier value-less
                    # attributes ("disabled class="); the name is the last
                    # whitespace-separated token.
                    tokens = input[start:i].split()
                    attrName = tokens[-1] if tokens else ""
                    currentNode.attributes[attrName] = None
                    status = Status.AttrValue

                    i += 1
                    # whitespace is allowed between "=" and the value
                    while i < length and input[i].isspace():
                        i += 1
                    if i < length and input[i] in ('\'', '"'):
                        quoteType = input[i]
                        i += 1
                    else:
                        quoteType = None
                    start = i
                else:
                    i += 1
            elif status == Status.AttrValue:
                if quoteType is not None:
                    index = input.find(quoteType, i)
                    # resume after the closing quote
                    resume = index + 1
                else:
                    # unquoted value: runs up to whitespace or ">"
                    index = i
                    while index < length and not (
                            input[index].isspace() or input[index] == ">"):
                        index += 1
                    # leave ">" for the AttrName state to handle
                    resume = index

                if index < 0:
                    # closing quote missing: abandon the tag
                    status = Status.Bad
                else:
                    attrValue = input[start:index].strip()
                    currentNode.attributes[attrName] = Parser.formatDigital(attrValue)

                    i = resume
                    status = Status.AttrName
                    start = i
            elif status == Status.RawText:
                index = input.find("</script", i)
                if index > -1:
                    node = Node("script")
                    node.parent = parentNode
                    parentNode.children.add(node)

                    # Jump past the closing tag so the script body is never
                    # scanned as markup.
                    i = index + len("</script>")
                    start = i
                    status = Status.Data
                else:
                    status = Status.Bad
            elif status == Status.Bad:
                # skip one character and resume normal scanning
                i += 1
                start = i
                status = Status.Data

        return root

    def dataToSeq(self, input):
        """Parse HTML text and return its token string (see ``toSeq``)."""
        return self.toSeq(self.parse(input))

    def dataToStr(self, input):
        """Parse HTML text and print the resulting tree; returns None."""
        self.toStr(self.parse(input), 0)

    def toStr(self, node, level):
        """Print ``node`` and its subtree, indented one space per level.

        For each node the tag name is printed, then the value of every
        whitelisted attribute that has one (spaces replaced by "-").
        """
        indent = " " * level
        print("%s<%s" % (indent, node.name))

        for attr, value in node.attributes.items():
            if attr not in self.whiteAttrs or value is None:
                continue
            print("%s%s" % (indent, value.replace(" ", "-")))

        for child in node.children:
            self.toStr(child, level + 1)

    def toSeq(self, node):
        """Flatten ``node`` and its subtree into a space-separated token string.

        Tokens are the node's name (only for ``self.specialTags``), then the
        values of its whitelisted attributes (spaces replaced by "-"), then
        the sequences of its children.  Identical child sequences are emitted
        once, and child sequences are sorted so the result is deterministic
        even though ``children`` is an unordered set.
        """
        seq = []

        if node.name in self.specialTags:
            seq.append(node.name)

        for attr, value in node.attributes.items():
            if attr not in self.whiteAttrs or value is None:
                continue
            seq.append(value.replace(" ", "-"))

        childSeqs = set()
        for child in node.children:
            childSeq = self.toSeq(child)
            if childSeq:
                childSeqs.add(childSeq)

        seq.extend(sorted(childSeqs))

        return " ".join(seq)

def minhash(set1, set2):
    """Return the Jaccard similarity of two sets: |set1 & set2| / |set1 | set2|.

    Despite the name, no MinHash sketching is done; the exact ratio is
    computed.  The result is a float in [0.0, 1.0].  Two empty sets are
    treated as identical and yield 1.0 instead of dividing by zero.
    """
    union = set1 | set2
    if not union:
        return 1.0
    mix = set1 & set2

    # float() keeps this a true division even without the __future__ import
    return len(mix) / float(len(union))

def test():
    """Fetch three live pages and print their pairwise structural similarity.

    Requires the third-party ``requests`` package and network access.
    """
    import requests

    parser = Parser()

    url1 = "https://www.taobao.com/"

    url2 = "https://detail.1688.com/offer/574230014567.html"
    url3 = "https://detail.1688.com/offer/574100377565.html"

    # a timeout keeps the demo from hanging forever on a stalled connection
    d1 = requests.get(url1, timeout=10).text
    d2 = requests.get(url2, timeout=10).text
    d3 = requests.get(url3, timeout=10).text

    seq1 = parser.dataToSeq(d1)
    seq2 = parser.dataToSeq(d2)
    seq3 = parser.dataToSeq(d3)

    s1 = set(seq1.split(" "))
    s2 = set(seq2.split(" "))
    s3 = set(seq3.split(" "))

    # each label names the pair that is actually compared
    print ("1-3 %.6f" % minhash(s1, s3))
    print ("1-2 %.6f" % minhash(s1, s2))
    print ("2-3 %.6f" % minhash(s2, s3))


# Run the demo comparison only when executed as a script, not on import.
if __name__ == "__main__":
    test()

	# -*- coding: utf-8 -*-
	from __future__ import division

	# Placeholder that Parser.formatDigital substitutes for every run of ASCII digits.
	NUMBER_TAG = "{NUM}"

	class Node(object):
	    """One element of the tree produced by ``Parser.parse``."""

	    def __init__(self, name):
	        # enclosing node; None until the node is attached to a parent
	        self.parent = None
	        # child nodes; an unordered set, so sibling order is not kept
	        self.children = set()
	        # tag name of this element
	        self.name = name
	        # attribute name -> attribute value collected from the start tag
	        self.attributes = {}

	class Status:
	    """States of the scanning loop in ``Parser.parse``."""
	    Data, RawText, Tag, EndTag, AttrName, AttrValue, Bad = range(7)

	class Parser(object):
	    """Lenient, whitelist-driven HTML scanner used to fingerprint page structure.

	    ``parse`` builds a tree of ``Node`` objects for whitelisted tags only.
	    ``toSeq`` flattens such a tree into a space-separated token string made
	    of special tag names and ``class``/``id`` values; callers split that
	    string into a set of tokens and compare the sets.
	    """

	    def __init__(self):
	        # Only these tags become nodes; every other tag is skipped.
	        # "dl" is listed because it is the container of the dd/dt entries.
	        self.whiteTags = set([
	            "body", "head", "html", "button", "dd", "div", "dl", "dt",
	            "form", "iframe", "input", "label", "li", "style",
	            "table", "tbody", "textarea", "title", "area", "tr", "td"])
	        # Only the values of these attributes are emitted by toSeq/toStr.
	        self.whiteAttrs = set(["class", "id"])
	        # Tags that never become the parent of following nodes.
	        # "input" and "area" are void elements; "button" is treated the same.
	        self.selfCloseTags = set(["button", "input", "area"])
	        # Tags whose own name is emitted as a token by toSeq.
	        self.specialTags = set(["button", "form", "input", "table", "textarea", "iframe"])

	    @staticmethod
	    def formatDigital(s):
	        """
	            Replace every run of ASCII digits in ``s`` with NUMBER_TAG.
	            e.g.
	                aaaa12nkkm45 -> aaaa{NUM}nkkm{NUM}

	            Returns "" when ``s`` is None or empty.
	        """
	        if not s:
	            return ""

	        bucket = []
	        inDigits = False
	        for ch in s:
	            if '0' <= ch <= '9':
	                # emit the placeholder once, at the start of each digit run
	                if not inDigits:
	                    bucket.append(NUMBER_TAG)
	                    inDigits = True
	            else:
	                bucket.append(ch)
	                inDigits = False

	        return "".join(bucket)

	    @staticmethod
	    def isTagChar(c):
	        """Return True when ``c`` is an ASCII letter (a-z or A-Z)."""
	        return ('a' <= c <= 'z') or ('A' <= c <= 'Z')

	    def parse(self, input):
	        """Scan HTML text and return the root ``Node`` (named "dom").

	        Only tags in ``self.whiteTags`` become nodes; ``<script>`` elements
	        are recorded as a childless "script" node and their content is
	        skipped.  Every attribute of a whitelisted tag is stored on its node,
	        with digit runs in the value replaced via ``formatDigital``.
	        Tag names are matched case-sensitively.  Malformed markup is
	        skipped rather than reported.
	        """
	        # Pad with spaces so the one-character look-ahead below never
	        # indexes past the end of the text.
	        input += " " * 20

	        root = Node("dom")
	        # innermost element that is still open; new nodes are attached here
	        parentNode = root
	        # node whose start tag is currently being scanned
	        currentNode = None

	        # start position of the current token (tag / attribute name / value)
	        start = 0
	        # quote character around the current attribute value, None if unquoted
	        quoteType = None
	        # current state of the scanner
	        status = Status.Data

	        attrName = None

	        length = len(input)
	        i = 0
	        while i < length:
	            if status == Status.Data:
	                index = input.find("<", i)
	                if index < 0:
	                    # No markup left: stop instead of re-searching the tail
	                    # once per remaining character.
	                    break
	                i = index + 1
	                if input[i] == '/':
	                    # </xxx
	                    #   ^
	                    i += 1
	                    status = Status.EndTag
	                else:
	                    # <xxx
	                    #  ^
	                    status = Status.Tag
	                start = i
	            elif status == Status.Tag:
	                while Parser.isTagChar(input[i]):
	                    i += 1

	                if i == start:
	                    # "<" not followed by a letter (e.g. "< x", "<!--")
	                    status = Status.Data
	                    i += 1
	                else:
	                    # <xxx
	                    #     ^
	                    name = input[start:i]
	                    if name == "script":
	                        status = Status.RawText
	                    elif name not in self.whiteTags:
	                        status = Status.Data
	                    else:
	                        currentNode = Node(name)
	                        currentNode.parent = parentNode
	                        parentNode.children.add(currentNode)

	                        status = Status.AttrName
	                        start = i
	            elif status == Status.EndTag:
	                index = input.find(">", i)
	                if index > -1:
	                    name = input[start:index].strip()

	                    # Close the nearest open element with this name.  The
	                    # search starts at the innermost open element and never
	                    # matches the synthetic root; an end tag that matches
	                    # nothing (non-whitelisted or stray) is ignored.
	                    node = parentNode
	                    while node is not root and node.name != name:
	                        node = node.parent
	                    if node is not root:
	                        parentNode = node.parent

	                # i is left in place; Data resumes at the next "<"
	                status = Status.Data
	            elif status == Status.AttrName:
	                ch = input[i]
	                if ch == ">":
	                    # <xxx xxx>
	                    #         ^
	                    status = Status.Data
	                    if currentNode.name not in self.selfCloseTags:
	                        parentNode = currentNode
	                elif ch == "/":
	                    # <xxxx xxx /
	                    #           ^
	                    if input.find(">", i) < 0:
	                        # the tag is never terminated: give up on it
	                        status = Status.Data
	                    i += 1
	                    start = i
	                elif ch == "=":
	                    # Text since `start` may hold earlier value-less
	                    # attributes ("disabled class="); the name is the last
	                    # whitespace-separated token.
	                    tokens = input[start:i].split()
	                    attrName = tokens[-1] if tokens else ""
	                    currentNode.attributes[attrName] = None
	                    status = Status.AttrValue

	                    i += 1
	                    # whitespace is allowed between "=" and the value
	                    while i < length and input[i].isspace():
	                        i += 1
	                    if i < length and input[i] in ('\'', '"'):
	                        quoteType = input[i]
	                        i += 1
	                    else:
	                        quoteType = None
	                    start = i
	                else:
	                    i += 1
	            elif status == Status.AttrValue:
	                if quoteType is not None:
	                    index = input.find(quoteType, i)
	                    # resume after the closing quote
	                    resume = index + 1
	                else:
	                    # unquoted value: runs up to whitespace or ">"
	                    index = i
	                    while index < length and not (
	                            input[index].isspace() or input[index] == ">"):
	                        index += 1
	                    # leave ">" for the AttrName state to handle
	                    resume = index

	                if index < 0:
	                    # closing quote missing: abandon the tag
	                    status = Status.Bad
	                else:
	                    attrValue = input[start:index].strip()
	                    currentNode.attributes[attrName] = Parser.formatDigital(attrValue)

	                    i = resume
	                    status = Status.AttrName
	                    start = i
	            elif status == Status.RawText:
	                index = input.find("</script", i)
	                if index > -1:
	                    node = Node("script")
	                    node.parent = parentNode
	                    parentNode.children.add(node)

	                    # Jump past the closing tag so the script body is never
	                    # scanned as markup.
	                    i = index + len("</script>")
	                    start = i
	                    status = Status.Data
	                else:
	                    status = Status.Bad
	            elif status == Status.Bad:
	                # skip one character and resume normal scanning
	                i += 1
	                start = i
	                status = Status.Data

	        return root

	    def dataToSeq(self, input):
	        """Parse HTML text and return its token string (see ``toSeq``)."""
	        return self.toSeq(self.parse(input))

	    def dataToStr(self, input):
	        """Parse HTML text and print the resulting tree; returns None."""
	        self.toStr(self.parse(input), 0)

	    def toStr(self, node, level):
	        """Print ``node`` and its subtree, indented one space per level.

	        For each node the tag name is printed, then the value of every
	        whitelisted attribute that has one (spaces replaced by "-").
	        """
	        indent = " " * level
	        print("%s<%s" % (indent, node.name))

	        for attr, value in node.attributes.items():
	            if attr not in self.whiteAttrs or value is None:
	                continue
	            print("%s%s" % (indent, value.replace(" ", "-")))

	        for child in node.children:
	            self.toStr(child, level + 1)

	    def toSeq(self, node):
	        """Flatten ``node`` and its subtree into a space-separated token string.

	        Tokens are the node's name (only for ``self.specialTags``), then the
	        values of its whitelisted attributes (spaces replaced by "-"), then
	        the sequences of its children.  Identical child sequences are emitted
	        once, and child sequences are sorted so the result is deterministic
	        even though ``children`` is an unordered set.
	        """
	        seq = []

	        if node.name in self.specialTags:
	            seq.append(node.name)

	        for attr, value in node.attributes.items():
	            if attr not in self.whiteAttrs or value is None:
	                continue
	            seq.append(value.replace(" ", "-"))

	        childSeqs = set()
	        for child in node.children:
	            childSeq = self.toSeq(child)
	            if childSeq:
	                childSeqs.add(childSeq)

	        seq.extend(sorted(childSeqs))

	        return " ".join(seq)

	def minhash(set1, set2):
	    """Return the Jaccard similarity of two sets: |set1 & set2| / |set1 | set2|.

	    Despite the name, no MinHash sketching is done; the exact ratio is
	    computed.  The result is a float in [0.0, 1.0].  Two empty sets are
	    treated as identical and yield 1.0 instead of dividing by zero.
	    """
	    union = set1 | set2
	    if not union:
	        return 1.0
	    mix = set1 & set2

	    # float() keeps this a true division even without the __future__ import
	    return len(mix) / float(len(union))

	def test():
	    """Fetch three live pages and print their pairwise structural similarity.

	    Requires the third-party ``requests`` package and network access.
	    """
	    import requests

	    parser = Parser()

	    url1 = "https://www.taobao.com/"

	    url2 = "https://detail.1688.com/offer/574230014567.html"
	    url3 = "https://detail.1688.com/offer/574100377565.html"

	    # a timeout keeps the demo from hanging forever on a stalled connection
	    d1 = requests.get(url1, timeout=10).text
	    d2 = requests.get(url2, timeout=10).text
	    d3 = requests.get(url3, timeout=10).text

	    seq1 = parser.dataToSeq(d1)
	    seq2 = parser.dataToSeq(d2)
	    seq3 = parser.dataToSeq(d3)

	    s1 = set(seq1.split(" "))
	    s2 = set(seq2.split(" "))
	    s3 = set(seq3.split(" "))

	    # each label names the pair that is actually compared
	    print ("1-3 %.6f" % minhash(s1, s3))
	    print ("1-2 %.6f" % minhash(s1, s2))
	    print ("2-3 %.6f" % minhash(s2, s3))


	# Run the demo comparison only when executed as a script, not on import.
	if __name__ == "__main__":
	    test()