Skip to content

Instantly share code, notes, and snippets.

@hmartiniano
Last active October 9, 2019 15:28
Show Gist options
  • Save hmartiniano/04b3bf88ab13285856b62559648fd8f5 to your computer and use it in GitHub Desktop.
Save hmartiniano/04b3bf88ab13285856b62559648fd8f5 to your computer and use it in GitHub Desktop.
Python command line script to run xpath queries on web pages.
#!/usr/bin/env python
import sys
import argparse
from lxml import html
from urllib.request import urlopen
def parse(base_href, xpath):
    """Download a page and evaluate an XPath expression against it.

    The document at ``base_href`` is fetched with ``urlopen`` and parsed with
    ``lxml.html``. Links in the parsed tree are rewritten to absolute URLs
    (relative to ``base_href``) before the query runs, so link-valued
    results come back in absolute form.

    Args:
        base_href: URL of the page to fetch; also used as the base for
            making links absolute.
        xpath: XPath expression to evaluate on the parsed document.

    Returns:
        Whatever the tree's ``xpath()`` call produces for the expression.
    """
    with urlopen(base_href) as response:
        payload = response.read()
    document = html.fromstring(payload)
    # Rewrite links before querying so matches carry absolute URLs.
    document.make_links_absolute(base_href, resolve_base_href=True)
    return document.xpath(xpath)
if __name__ == '__main__':
    # Command-line entry point: fetch `url`, run the XPath query given with
    # -x/--xpath, and write one result per line to `outfile` (stdout when
    # no output file is given).
    parser = argparse.ArgumentParser(description='xtrct')
    parser.add_argument('url', type=str, default=None)
    parser.add_argument('outfile', nargs='?', type=argparse.FileType('w'),
                        default=sys.stdout)
    parser.add_argument("-x", "--xpath", type=str, default='/', help="xpath")
    args = parser.parse_args()
    try:
        for item in parse(args.url, args.xpath):
            args.outfile.write(str(item) + "\n")
    finally:
        # argparse.FileType opens the file but never closes it; close it
        # here so the output is flushed even if the query fails. Leave
        # sys.stdout alone (it is also what FileType returns for '-').
        if args.outfile is not sys.stdout:
            args.outfile.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment