Skip to content

Instantly share code, notes, and snippets.

@romanofski
Created August 5, 2014 23:10
Show Gist options
  • Save romanofski/d6c7076da35d5e11fed7 to your computer and use it in GitHub Desktop.
Follows links from a main page and downloads subsequent links config
#!/usr/bin/env python3
#
# Follows links from a main page and downloads subsequent links config.
# The script was written for a specific site, so don't wonder why all
# selectors are hard coded.
#
from lxml import html
from lxml import etree
import requests
import sys
MAIN_PAGE = ''
def get_main_links():
    """Fetch MAIN_PAGE and return the anchor elements inside its main table.

    Returns a list of lxml <a> elements matched by the hard-coded selector
    '.field-item.even table a' (selectors are site-specific by design).

    NOTE(review): verify=False disables TLS certificate verification; this
    is insecure and should only be kept if the target site truly has a
    broken certificate chain — confirm before reuse.
    """
    req = requests.get(MAIN_PAGE, verify=False)
    page = html.document_fromstring(req.text)
    return page.cssselect('.field-item.even table a')
def clean_empty_tags(node):
    """Remove <p> elements whose entire content is a non-breaking space.

    Such paragraphs come out of the source markup broken and carry no
    useful content, so they are detached from their parents in place.
    """
    empty_paragraphs = node.xpath("//p[.='\xa0']")
    for paragraph in empty_paragraphs:
        paragraph.getparent().remove(paragraph)
def writeout_html():
    """Follow each main-page link, clean its body markup, print to stdout.

    For every anchor returned by get_main_links(): download the target
    page, pick the first 'div.region-content div.field-name-body' element,
    strip whitespace-only paragraphs via clean_empty_tags(), and write the
    resulting HTML (UTF-8, unprettified) to standard output.
    """
    for anchor in get_main_links():
        # verify=False mirrors the main-page fetch (site has TLS issues).
        response = requests.get(anchor.get('href'), verify=False)
        document = html.document_fromstring(response.text)
        body = document.cssselect('div.region-content div.field-name-body')[0]
        clean_empty_tags(body)
        markup = etree.tostring(body, encoding='utf-8', pretty_print=False)
        sys.stdout.write(markup.decode('utf-8'))
if __name__ == '__main__':
    # Script entry point: run the scraper end to end.
    writeout_html()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment