Skip to content

Instantly share code, notes, and snippets.

@gipi
Last active July 18, 2019 11:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gipi/97f7c9813878777e50cf5a492f4bf29b to your computer and use it in GitHub Desktop.
Save gipi/97f7c9813878777e50cf5a492f4bf29b to your computer and use it in GitHub Desktop.
Creates CSV files listing the PEC (certified e-mail) addresses of the Italian municipalities. From the main page it retrieves the URL of each region, from which it builds the corresponding CSV.
# Scraper setup: stdlib imports first, then third-party, then the
# module-level configuration shared by every function below.
import logging
import os
import re
import sys
from pathlib import Path

import requests
from requests_html import HTML

logging.basicConfig()
logger = logging.getLogger()
logger.setLevel('INFO')

# Index of the <td> column that holds the mailto: anchor in the region tables.
COLUMN_EMAIL = 3
URL_ROOT_DOMAIN = 'http://www.comuniverso.it/'
URL_ROOT_PAGE = 'index.cfm?Comuni_con_PEC&menu=271'
DOWNLOADS_FOLDER = 'downloads'
DIR = os.path.dirname(__file__)


def get_download_path(filename):
    """Return the local cache path for *filename* inside the downloads dir."""
    return Path(DIR) / DOWNLOADS_FOLDER / filename
def download_and_save(url, filename):
    """Download *url* (relative to URL_ROOT_DOMAIN) and cache it as *filename*.

    The file is stored under the downloads directory, which is created on
    demand.  If the cached file already exists the download is skipped.

    Returns the Path of the cached file.
    Raises Exception when the HTTP response status is not 200.
    """
    download_dir = Path(DIR) / DOWNLOADS_FOLDER
    download_path = download_dir / filename

    if not download_dir.exists():
        # lazy %-args: the message is only rendered if INFO is enabled
        logger.info("creating directory '%s'", download_dir)
        download_dir.mkdir(parents=True)

    # Cached copy wins: never re-download an existing file.
    if download_path.exists():
        return download_path

    logger.info("'%s' doesn't exist, I'll download it using '%s'",
                download_path, url)
    response = requests.get(URL_ROOT_DOMAIN + url)

    if response.status_code != 200:
        raise Exception("connection to '%s' failed" % url)

    with download_path.open("w") as f:
        f.write(response.text)

    return download_path
def usage(argname):
    """Emit a usage line for *argname* on stdout and exit with status 1."""
    sys.stdout.write("usage: %s\n" % argname)
    sys.exit(1)
def csv_from_html(filepath):
    """Parse a cached region page and return its table rows as CSV strings.

    The e-mail column (COLUMN_EMAIL) is taken from the ``mailto:`` href of
    its anchor; every other column contributes its plain text.
    """
    with open(filepath) as f:
        contents = f.read()

    html = HTML(html=contents)
    table = html.xpath('/html/body/a/table')[0]
    tbody = table.find('tbody')[0]

    rows = []
    for row in tbody.find('tr'):
        fields = []
        for idx, cell in enumerate(row.find('td')):
            if idx == COLUMN_EMAIL:
                # strip the leading "mailto:" scheme from the anchor target
                href = cell.find('a')[0].attrs['href']
                fields.append(href[len('mailto:'):])
            else:
                fields.append(cell.text)
        rows.append(','.join(fields))

    return rows
def parse_root_page(filename):
    """Extract ``(region_name, relative_url)`` pairs from the cached root page.

    Rows without an anchor are skipped; the region name is lower-cased and
    taken from the first line of the row text.
    """
    filepath = get_download_path(filename)

    with filepath.open() as f:
        html = HTML(html=f.read())

    # The list of regions lives in the third table nested inside the first.
    region_table = [t for t in html.find('table')[0].find('table')][2]

    regions = []
    for row in region_table.find('tr'):
        anchors = row.find('a')
        if not anchors:
            continue
        link = anchors[0].attrs['href']
        # row.text contains a couple of '\n'; the name is the first line
        region = row.text.lower().split('\n')[0]
        regions.append((region, link))

    return regions
def save_csv(url, name):
    """Download the region page at *url* and write ``<name>.csv`` alongside
    the current working directory, one CSV line per table row."""
    cached = download_and_save(url, '%s.html' % name)
    rows = csv_from_html(cached)
    with Path('%s.csv' % name).open("w") as f:
        f.writelines(row + '\n' for row in rows)
def retrieve_regions():
    """Fetch and cache the root page, then return its (region, url) pairs."""
    root = 'root.html'
    download_and_save(URL_ROOT_PAGE, root)
    return parse_root_page(root)
if __name__ == '__main__':
    # One CSV per region: region name doubles as the output file stem.
    for region_name, region_url in retrieve_regions():
        save_csv(region_url, region_name)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment