Skip to content

Instantly share code, notes, and snippets.

@kenzotakahashi
Last active February 21, 2017 05:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kenzotakahashi/390552e85c05af4c39112dcb116fa969 to your computer and use it in GitHub Desktop.
Save kenzotakahashi/390552e85c05af4c39112dcb116fa969 to your computer and use it in GitHub Desktop.
HTML elementのXpathを取得するスクリプト
from urllib import request
from lxml import etree
import re
def get_index(e):
    """Return the 1-based position of ``e`` among its same-tag siblings.

    Args:
        e: An element exposing ``tag`` and ``itersiblings(preceding=...)``.

    Returns:
        ``None`` when no sibling shares the element's tag (so no positional
        predicate is needed), otherwise the number of preceding same-tag
        siblings plus one.
    """
    own_tag = e.tag
    before = sum(
        1 for sib in e.itersiblings(preceding=True) if sib.tag == own_tag
    )
    has_after = any(sib.tag == own_tag for sib in e.itersiblings())
    # The element is unique among its siblings: no index required.
    if before == 0 and not has_after:
        return None
    return before + 1
def is_valid_class(c, siblings):
    """Decide whether class name ``c`` may be used to identify an element.

    Args:
        c: A single class token taken from the element's ``class`` attribute.
        siblings: An iterable of containers (lists of class tokens, as built
            by ``get_one_path``), one per competing sibling element.

    Returns:
        ``False`` if the token contains a digit, is empty after stripping
        surrounding whitespace, or appears in any sibling's container;
        ``True`` otherwise.
    """
    # Class names containing digits are rejected outright.
    if re.search(r'[0-9]', c):
        return False
    c = c.strip()
    # An empty token (e.g. produced by splitting "a  b" on single spaces)
    # would yield a "contains(..., '  ')" predicate that can never match.
    if not c:
        return False
    # The class is only distinguishing if no sibling carries it as well.
    return not any(c in sibling for sibling in siblings)
def get_one_path(e):
    """Build the XPath step that identifies ``e`` relative to its parent.

    The step is chosen in this order of preference:

    1. ``tag[@id='...']`` when the element has an ``id`` without digits.
    2. ``tag[contains(concat(' ', normalize-space(@class), ' '), ' cls ')]``
       for the first class token accepted by ``is_valid_class`` (no digits
       and not shared by a same-tag sibling).
    3. ``tag[n]`` using the position from ``get_index``, or the bare tag
       when the element has no same-tag siblings.

    Args:
        e: An element exposing ``tag``, ``attrib`` and ``itersiblings``.

    Returns:
        The XPath step as a string, without a leading slash.
    """
    tag = e.tag
    attributes = e.attrib

    if 'id' in attributes:
        element_id = attributes['id']
        # Ids containing digits are skipped in favour of class/index steps.
        if not re.search(r'[0-9]', element_id):
            return tag + "[@id='%s']" % (element_id)

    if 'class' in attributes:
        # Use a class only when no same-tag sibling carries the same class.
        rivals = [
            sib
            for sib in list(e.itersiblings(preceding=True)) + list(e.itersiblings())
            if sib.tag == tag and 'class' in sib.attrib
        ]
        # split() without arguments tolerates repeated spaces, tabs and
        # newlines and never produces empty tokens, unlike split(' ').
        sibling_classes = [sib.attrib['class'].split() for sib in rivals]
        for name in attributes['class'].split():
            if is_valid_class(name, sibling_classes):
                return tag + "[contains(concat(' ', normalize-space(@class), ' '), ' %s ')]" % (name)

    # Fall back to a positional predicate; computed only when needed.
    index = get_index(e)
    return tag + ("[%s]" % (index) if index else "")
def get_xpath(e):
    """Return an absolute XPath for ``e`` by walking up to the root.

    Each element on the way up contributes one step from ``get_one_path``;
    the steps are joined with ``/`` from the root down to ``e``.

    Args:
        e: An element exposing ``getparent()``.

    Returns:
        The XPath string, starting with ``/``.
    """
    steps = [get_one_path(e)]
    ancestor = e.getparent()
    # getparent() returns None once the root element has been passed.
    while ancestor is not None:
        steps.append(get_one_path(ancestor))
        ancestor = ancestor.getparent()
    # Steps were gathered leaf-first; emit them root-first.
    return "".join("/" + step for step in reversed(steps))
# Page fetched by main() as the sample document for the XPath round-trip check.
url = "https://rent.tokyu-housing-lease.co.jp/rent/8016671/6337"
def main():
    """Fetch the sample page and check that get_xpath round-trips a known path.

    Downloads ``url``, parses it as HTML, selects the first element matching
    a hand-written XPath, and prints whether ``get_xpath`` regenerates exactly
    that XPath for the element.
    """
    with request.urlopen(url) as response:
        html_text = response.read().decode('utf-8')
    document = etree.HTML(html_text)

    # Reference path, written in the same form get_xpath is expected to emit.
    given_path = (
        "/html/body[@id='diamondtail']"
        "/div[@id='wrap']"
        "/div[@id='contents_wrap']"
        "/div[@id='contents']"
        "/div[@id='contents_inner']"
        "/div[@id='article']"
        "/div[@id='item_detail']"
        "/div"
        "/table[contains(concat(' ', normalize-space(@class), ' '), ' item_table ')]"
        "/tr[5]/th[1]"
    )
    target = document.xpath(given_path)[0]
    regenerated = get_xpath(target)
    print(regenerated == given_path)
# Run the round-trip check only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment