#!/usr/bin/env python3
'''Print list of all comics from http://questionablecontent.net/archive.php'''
import csv
import re
import sys
from urllib.error import HTTPError, URLError
from urllib.parse import urljoin
from urllib.request import Request, urlopen
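
# Third-party dependencies: BeautifulSoup is used with the 'html5lib' parser,
# so both the beautifulsoup4 and html5lib packages need to be installed
# (e.g. `pip install beautifulsoup4 html5lib`).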
from bs4 import BeautifulSoup as BS

__author__ = 'Ed Flanagan'
__email__ = 'ed@flanagan.xyz'
__license__ = 'BSD-2-Clause'

BASE_URL = 'http://questionablecontent.net'
ARCHIVE_URL = urljoin(BASE_URL, 'archive.php')


def eprint(*args, **kwargs):
    '''Print to stderr'''
    print(*args, file=sys.stderr, **kwargs)


def scrape_page(url):
    '''Return URL response content'''
    # Some servers reject requests carrying the default urllib User-Agent,
    # so send a minimal custom one.
    headers = {'User-Agent': 'urllib'}
    req = Request(url, headers=headers)
    try:
        res = urlopen(req)
        encoding = res.info().get_content_charset(failobj='utf-8')
        content = res.read()
        try:
            return content.decode(encoding)
        except UnicodeDecodeError:
            eprint('Failed to decode to unicode.')
            return content
        except (LookupError, ValueError):
            # Unknown or invalid charset name from the Content-Type header
            eprint('Failed to decode with charset {!r}'.format(encoding))
            return content
    except (HTTPError, URLError) as err:
        # Let the caller decide how to report network errors
        raise err


def get_comic_links(html):
    '''Return list of comic links sorted by id'''
    # Archive entries look roughly like:
    #   <a href="view.php?comic=NNN">Comic NNN: Title</a>
    link_regex = re.compile(r'^view\.php\?comic\=(\d+)')
    title_regex = re.compile(r'Comic\s*(\d+)\:\s*(.+)')
    soup = BS(html, 'html5lib')
    comic_links = soup.find_all('a', href=link_regex)
    links = []
    for link in comic_links:
        title_matches = title_regex.match(link.get_text())
        if not title_matches:
            # Skip links whose text doesn't follow the 'Comic NNN: Title' pattern
            continue
        comic_id = title_matches.group(1)
        comic_title = title_matches.group(2)
        comic_link = urljoin(BASE_URL, link['href'])
        links.append({
            'id': int(comic_id),
            'title': comic_title,
            'link': comic_link
        })
    return sorted(links, key=lambda row: row['id'])


def print_tsv(rows):
    '''Print TSV to stdout'''
    fieldnames = ['id', 'title', 'link']
    writer = csv.DictWriter(sys.stdout, delimiter='\t', fieldnames=fieldnames,
                            quoting=csv.QUOTE_MINIMAL)
    for row in rows:
        writer.writerow(row)


def main():
    '''Main function'''
    # Get archive page content
    try:
        raw_html = scrape_page(ARCHIVE_URL)
    except HTTPError as err:
        eprint('urllib HTTPError:')
        eprint('Error code: ', err.code)
        eprint(err.reason)
        sys.exit(1)
    except URLError as err:
        eprint('urllib URLError:')
        eprint('Reason: ', err.reason)
        sys.exit(1)

    # Extract links from archive page
    links = get_comic_links(raw_html)

    # Print links as TSV to stdout
    print_tsv(links)


if __name__ == '__main__':
    main()
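
# Usage sketch: the gist does not name the file, so the filename below is an
# assumption. Run the script and redirect the TSV written to stdout, e.g.:
#
#   python3 qc_archive.py > comics.tsv
#
# Each output row is tab-separated: id, title, link. Errors while fetching the
# archive page are reported on stderr and the script exits with status 1.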