Skip to content

Instantly share code, notes, and snippets.

@romanofski
Created August 5, 2014 23:10
Show Gist options
  • Save romanofski/d6c7076da35d5e11fed7 to your computer and use it in GitHub Desktop.
Follows links from a main page and downloads subsequent links config
#!/usr/bin/env python3
#
# Follows links from a main page and downloads subsequent links config.
# The script was written for a specific site, so don't wonder why all
# selectors are hard coded.
#
from lxml import html
from lxml import etree
import requests
import sys
MAIN_PAGE = ''
def get_main_links():
    """Fetch MAIN_PAGE and return the anchor elements inside its main table.

    Returns a list of lxml <a> elements matched by the hard-coded selector
    '.field-item.even table a' (selectors are site-specific by design).

    NOTE(review): verify=False disables TLS certificate verification; this
    is insecure and should only be kept if the target site truly has a
    broken certificate chain — confirm before reuse.
    """
    req = requests.get(MAIN_PAGE, verify=False)
    page = html.document_fromstring(req.text)
    return page.cssselect('.field-item.even table a')
def clean_empty_tags(node):
    """Remove <p> elements whose entire content is a non-breaking space.

    Such paragraphs come out of the source markup broken and carry no
    useful content, so they are detached from their parents in place.
    """
    empty_paragraphs = node.xpath("//p[.='\xa0']")
    for paragraph in empty_paragraphs:
        paragraph.getparent().remove(paragraph)
def writeout_html():
    """Follow each main-page link, clean its body markup, print to stdout.

    For every anchor returned by get_main_links(): download the target
    page, pick the first 'div.region-content div.field-name-body' element,
    strip whitespace-only paragraphs via clean_empty_tags(), and write the
    resulting HTML (UTF-8, unprettified) to standard output.
    """
    for anchor in get_main_links():
        # verify=False mirrors the main-page fetch (site has TLS issues).
        response = requests.get(anchor.get('href'), verify=False)
        document = html.document_fromstring(response.text)
        body = document.cssselect('div.region-content div.field-name-body')[0]
        clean_empty_tags(body)
        markup = etree.tostring(body, encoding='utf-8', pretty_print=False)
        sys.stdout.write(markup.decode('utf-8'))
if __name__ == '__main__':
    # Script entry point: run the scraper end to end.
    writeout_html()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment