beannguyen/xpath_soup.py

## xpath_soup.py
#!/usr/bin/python
# -*- coding: utf-8 -*-

def xpath_soup(element):
    """
    Generate xpath from BeautifulSoup4 element

    Walks from the element up through its ancestors and emits one path
    step per level.  A step is the bare tag name when the tag is the only
    direct child of that name under its parent, otherwise ``name[n]`` where
    ``n`` is the 1-based position among the same-named direct children.

    Siblings are matched by identity (``is``), not equality: bs4 elements
    compare equal when they represent the same markup, so an equality
    lookup would give every identical-looking sibling the index of the
    first one.

    :param element: BeautifulSoup4 element.
    :type element: bs4.element.Tag or bs4.element.NavigableString
    :return: xpath as string
    :rtype: str
    :raises ValueError: if a node is not found among the same-named direct
        children of its parent.

    Usage:

    >>> import bs4
    >>> html = (
    ...     '<html><head><title>title</title></head>'
    ...     '<body><p>p <i>1</i></p><p>p <i>2</i></p></body></html>'
    ...     )
    >>> soup = bs4.BeautifulSoup(html, 'html.parser')
    >>> xpath_soup(soup.html.body.p.i)
    '/html/body/p[1]/i'

    Identical siblings still get distinct positions:

    >>> twins = bs4.BeautifulSoup('<div><p>x</p><p>x</p></div>', 'html.parser')
    >>> xpath_soup(twins.div.find_all('p')[1])
    '/div/p[2]'

    """
    components = []
    # An element without a tag name (e.g. a NavigableString) is addressed
    # through its parent tag.
    child = element if element.name else element.parent
    for parent in child.parents:
        # Direct children of ``parent`` that share the child's tag name.
        siblings = parent.find_all(child.name, recursive=False)
        # Locate the child by identity so that equal-looking siblings are
        # not confused with one another.
        position = next(
            (index for index, sibling in enumerate(siblings, 1)
             if sibling is child),
            None
            )
        if position is None:
            raise ValueError(
                '%r is not a direct child of its parent' % (child.name,)
                )
        if len(siblings) == 1:
            # Unique among its siblings: no positional predicate needed.
            components.append(child.name)
        else:
            components.append('%s[%d]' % (child.name, position))
        child = parent
    # Steps were collected leaf-to-root; xpath reads root-to-leaf.
    components.reverse()
    return '/%s' % '/'.join(components)


if __name__ == '__main__':
    # Executed as a script: run the docstring examples in this module as a
    # self-test, printing every example as it is checked.
    import doctest
    doctest.testmod(verbose=True)
	#!/usr/bin/python
	# -*- coding: utf-8 -*-

	def xpath_soup(element):
	    """
	    Generate xpath from BeautifulSoup4 element

	    Walks from the element up through its ancestors and emits one path
	    step per level.  A step is the bare tag name when the tag is the only
	    direct child of that name under its parent, otherwise ``name[n]``
	    where ``n`` is the 1-based position among the same-named direct
	    children.

	    Siblings are matched by identity (``is``), not equality: bs4 elements
	    compare equal when they represent the same markup, so an equality
	    lookup would give every identical-looking sibling the index of the
	    first one.

	    :param element: BeautifulSoup4 element.
	    :type element: bs4.element.Tag or bs4.element.NavigableString
	    :return: xpath as string
	    :rtype: str
	    :raises ValueError: if a node is not found among the same-named
	        direct children of its parent.

	    Usage:

	    >>> import bs4
	    >>> html = (
	    ...     '<html><head><title>title</title></head>'
	    ...     '<body><p>p <i>1</i></p><p>p <i>2</i></p></body></html>'
	    ...     )
	    >>> soup = bs4.BeautifulSoup(html, 'html.parser')
	    >>> xpath_soup(soup.html.body.p.i)
	    '/html/body/p[1]/i'

	    """
	    components = []
	    # An element without a tag name (e.g. a NavigableString) is addressed
	    # through its parent tag.
	    child = element if element.name else element.parent
	    for parent in child.parents:
	        # Direct children of ``parent`` that share the child's tag name.
	        siblings = parent.find_all(child.name, recursive=False)
	        # Locate the child by identity so that equal-looking siblings are
	        # not confused with one another.
	        position = next(
	            (index for index, sibling in enumerate(siblings, 1)
	             if sibling is child),
	            None
	            )
	        if position is None:
	            raise ValueError(
	                '%r is not a direct child of its parent' % (child.name,)
	                )
	        if len(siblings) == 1:
	            # Unique among its siblings: no positional predicate needed.
	            components.append(child.name)
	        else:
	            components.append('%s[%d]' % (child.name, position))
	        child = parent
	    # Steps were collected leaf-to-root; xpath reads root-to-leaf.
	    components.reverse()
	    return '/%s' % '/'.join(components)


	if __name__ == '__main__':
	    # Executed as a script: run the docstring examples as a self-test,
	    # printing every example as it is checked.
	    import doctest
	    doctest.testmod(verbose=True)