# rosstex/soup_xpath_gen.py

## soup_xpath_gen.py
import html

# Characters that cannot be embedded safely in the single-quoted XPath string
# literals built below; attribute values containing any of them are skipped.
BAD_CHARS = {"\"", "'", "[", "]"}


def soup_xpath_gen(element):
    """Generate an XPath string for a given BeautifulSoup element.

    The function walks from ``element`` upward through ``.parent`` links and
    emits one location step per node, using each node's ``.name`` and
    ``.attrs``. The walk stops at the ``html`` element (the result is then
    absolute and starts with ``/html``), at the document root, or when a node
    has no parent (the result is then relative).

    Each step is the tag name, followed by a predicate built from the node's
    attributes:

    * ``title`` attributes are never used.
    * Values containing any character in ``BAD_CHARS`` are skipped.
    * Values containing ``/`` only assert the attribute is non-empty
      (``normalize-space(@attr)``) instead of comparing the value.
    * Everything else becomes
      ``normalize-space(@attr)=normalize-space('value')``.

    Attributes of the ``html`` element itself are ignored.

    Args:
        element: A node exposing ``name``, ``attrs`` (a mapping of attribute
            name to a string or a list of strings) and ``parent``.

    Returns:
        The XPath as a string with no trailing ``/``. It is empty when
        ``element`` is the document root itself.
    """
    steps = []  # location steps collected leaf-first
    reached_html = False

    # bs4 names its root object "[document]"; the plain "document" spelling is
    # kept for compatibility. The ``None`` guard covers detached subtrees,
    # which previously crashed with AttributeError once the parents ran out.
    while element is not None and element.name not in ("document", "[document]"):
        if element.name == "html":
            reached_html = True
            break

        predicates = []
        for key, value in element.attrs.items():
            if key == "title":
                continue
            # Multi-valued attributes such as ``class`` arrive as lists. Join
            # them first so the character checks below inspect the real text
            # rather than comparing whole tokens.
            if isinstance(value, list):
                value = " ".join(value)
            if any(char in BAD_CHARS for char in value):
                continue
            if "/" in value:  # URL matching is wonky, so only test presence
                predicates.append("normalize-space(@%s)" % key)
            else:
                # NOTE(review): html.escape rewrites &, < and > as entities;
                # kept so existing output is unchanged -- confirm consumers
                # of the XPath expect that.
                predicates.append(
                    "normalize-space(@%s)=normalize-space('%s')"
                    % (key, html.escape(value))
                )

        step = str(element.name)
        if predicates:
            # Only add brackets when something survived the filtering;
            # an empty "[]" predicate is not valid XPath.
            step += "[" + " and ".join(predicates) + "]"
        steps.append(step)
        element = element.parent

    xpath = "/".join(reversed(steps))
    if reached_html:
        xpath = "/html/" + xpath
    return xpath.rstrip("/")
	import html

	# Characters that cannot be embedded safely in the single-quoted XPath
	# string literals built below; attribute values containing any of them
	# are skipped.
	BAD_CHARS = {"\"", "'", "[", "]"}


	def soup_xpath_gen(element):
	    """Generate an XPath string for a given BeautifulSoup element.

	    The function walks from ``element`` upward through ``.parent`` links
	    and emits one location step per node, using each node's ``.name`` and
	    ``.attrs``. The walk stops at the ``html`` element (the result is then
	    absolute and starts with ``/html``), at the document root, or when a
	    node has no parent (the result is then relative).

	    Each step is the tag name, followed by a predicate built from the
	    node's attributes:

	    * ``title`` attributes are never used.
	    * Values containing any character in ``BAD_CHARS`` are skipped.
	    * Values containing ``/`` only assert the attribute is non-empty
	      (``normalize-space(@attr)``) instead of comparing the value.
	    * Everything else becomes
	      ``normalize-space(@attr)=normalize-space('value')``.

	    Attributes of the ``html`` element itself are ignored.

	    Args:
	        element: A node exposing ``name``, ``attrs`` (a mapping of
	            attribute name to a string or a list of strings) and
	            ``parent``.

	    Returns:
	        The XPath as a string with no trailing ``/``. It is empty when
	        ``element`` is the document root itself.
	    """
	    steps = []  # location steps collected leaf-first
	    reached_html = False

	    # bs4 names its root object "[document]"; the plain "document"
	    # spelling is kept for compatibility. The ``None`` guard covers
	    # detached subtrees, which would otherwise raise AttributeError once
	    # the parents run out.
	    while element is not None and element.name not in ("document", "[document]"):
	        if element.name == "html":
	            reached_html = True
	            break

	        predicates = []
	        for key, value in element.attrs.items():
	            if key == "title":
	                continue
	            # Multi-valued attributes such as ``class`` arrive as lists.
	            # Join them first so the character checks below inspect the
	            # real text rather than comparing whole tokens.
	            if isinstance(value, list):
	                value = " ".join(value)
	            if any(char in BAD_CHARS for char in value):
	                continue
	            if "/" in value:  # URL matching is wonky, so only test presence
	                predicates.append("normalize-space(@%s)" % key)
	            else:
	                # NOTE(review): html.escape rewrites &, < and > as
	                # entities; kept so the generated text is unchanged --
	                # confirm consumers of the XPath expect that.
	                predicates.append(
	                    "normalize-space(@%s)=normalize-space('%s')"
	                    % (key, html.escape(value))
	                )

	        step = str(element.name)
	        if predicates:
	            # Only add brackets when something survived the filtering;
	            # an empty "[]" predicate is not valid XPath.
	            step += "[" + " and ".join(predicates) + "]"
	        steps.append(step)
	        element = element.parent

	    xpath = "/".join(reversed(steps))
	    if reached_html:
	        xpath = "/html/" + xpath
	    return xpath.rstrip("/")