# kenzotakahashi/get_xpath.py

## get_xpath.py
from urllib import request
from lxml import etree
import re

def get_index(e):
	"""Return the 1-based position of ``e`` among siblings with the same tag.

	Returns ``None`` when no sibling (before or after) shares ``e``'s tag,
	which lets the caller omit the positional predicate.
	"""
	same_tag_before = sum(
		1 for sib in e.itersiblings(preceding=True) if sib.tag == e.tag
	)
	if same_tag_before:
		return same_tag_before + 1
	# Nothing with this tag precedes e; a position is only reported when at
	# least one same-tag sibling follows it.
	followed_by_same_tag = any(sib.tag == e.tag for sib in e.itersiblings())
	return 1 if followed_by_same_tag else None

def is_valid_class(c, siblings):
	"""Decide whether class token ``c`` may be used in an XPath predicate.

	Args:
		c: A single class token from the element's ``class`` attribute.
		siblings: Iterable of class-token lists, one list per sibling.

	Returns:
		``True`` only if ``c`` (after stripping whitespace) is non-empty,
		contains no digit, and is not found in any sibling's token list.
	"""
	c = c.strip()
	# Splitting a class attribute on single spaces yields '' for doubled,
	# leading or trailing spaces; an empty token cannot identify an element.
	if not c:
		return False
	# Tokens containing a digit are never used.
	if re.search(r'[0-9]', c):
		return False
	return not any(c in sibling for sibling in siblings)

def get_one_path(e):
	"""Build the XPath step (tag plus optional predicate) for element ``e``.

	The first applicable form is returned:
	  1. ``tag[@id='...']`` when ``e`` has an id containing no digit.
	  2. ``tag[contains(concat(...), ' cls ')]`` for the first class token
	     accepted by ``is_valid_class`` (checked against same-tag siblings
	     that carry a class attribute).
	  3. ``tag[n]`` using ``get_index``, or the bare tag when ``get_index``
	     returns ``None``.
	"""
	tag = e.tag
	attrib = e.attrib
	if 'id' in attrib:
		val = attrib['id']
		if not re.search(r'[0-9]', val):
			return tag + "[@id='%s']" % val
	if 'class' in attrib:
		# Use a class only when no same-tag sibling carries the same class.
		# split() with no argument splits on any whitespace run and never
		# yields empty tokens, unlike split(' ').
		sibling_classes = []
		for siblings in (e.itersiblings(preceding=True), e.itersiblings()):
			for sib in siblings:
				if sib.tag == tag and 'class' in sib.attrib:
					sibling_classes.append(sib.attrib['class'].split())
		for c in attrib['class'].split():
			if is_valid_class(c, sibling_classes):
				return tag + "[contains(concat(' ', normalize-space(@class), ' '), ' %s ')]" % c
	# Fall back to the positional index; it is only computed when needed.
	index = get_index(e)
	return tag + ("[%s]" % index if index else "")

def get_xpath(e):
	"""Return the absolute XPath of ``e``.

	Walks from ``e`` up through ``getparent()`` until there is no parent,
	producing one ``get_one_path`` step per element.
	"""
	steps = [get_one_path(e)]
	ancestor = e.getparent()
	while ancestor is not None:
		steps.append(get_one_path(ancestor))
		ancestor = ancestor.getparent()
	# Steps were collected leaf-first; the path is written root-first.
	return "".join("/%s" % step for step in reversed(steps))

url = 'https://rent.tokyu-housing-lease.co.jp/rent/8016671/6337'
def main():
	"""Fetch ``url`` and print whether ``get_xpath`` reproduces a known XPath.

	The element is first located with the hand-written ``given_path``; the
	path generated for that element is then compared with it.
	"""
	given_path = "/html/body[@id='diamondtail']/div[@id='wrap']/div[@id='contents_wrap']/div[@id='contents']/div[@id='contents_inner']/div[@id='article']/div[@id='item_detail']/div/table[contains(concat(' ', normalize-space(@class), ' '), ' item_table ')]/tr[5]/th[1]"
	with request.urlopen(url) as response:
		html_text = response.read().decode('utf-8')
	document = etree.HTML(html_text)
	target = document.xpath(given_path)[0]
	print(get_xpath(target) == given_path)

# Run the self-check only when this file is executed as a script.
if __name__ == '__main__':
	main()