Skip to content

Instantly share code, notes, and snippets.

@gipi

gipi/pec.py

Last active Jul 18, 2019
Embed
What would you like to do?
Crea dei CSV con l'elenco degli indirizzi PEC dei vari comuni italiani. Dalla pagina principale recupera l'URL delle singole regioni da cui crea il corrispondente CSV.
# Standard library imports first, then third-party, per PEP 8.
import logging
import os
import re
import sys
from pathlib import Path

import requests
from requests_html import HTML

# Configure the root logger so progress messages are visible when run as a script.
logging.basicConfig()
logger = logging.getLogger()
logger.setLevel('INFO')
# Index of the table column that holds the PEC e-mail address.
COLUMN_EMAIL = 3
URL_ROOT_DOMAIN = 'http://www.comuniverso.it/'
URL_ROOT_PAGE = 'index.cfm?Comuni_con_PEC&menu=271'
DOWNLOADS_FOLDER = 'downloads'
# Directory containing this script; downloads are cached relative to it.
DIR = os.path.dirname(__file__)


def get_download_path(filename):
    """Return the Path of *filename* inside the local downloads folder."""
    # A named def instead of an assigned lambda (PEP 8 E731): easier to
    # introspect and to document.
    return Path(DIR) / DOWNLOADS_FOLDER / filename
def download_and_save(url, filename):
    """Download *url* (relative to URL_ROOT_DOMAIN) and cache it as *filename*.

    The file is stored under the downloads folder next to this script; if
    it is already cached the download is skipped entirely.

    Args:
        url: path+query part of the page, appended to URL_ROOT_DOMAIN.
        filename: name of the cache file inside the downloads folder.

    Returns:
        Path of the cached file.

    Raises:
        Exception: when the HTTP response status is not 200.
    """
    download_dir = Path(DIR) / DOWNLOADS_FOLDER
    download_path = download_dir / filename

    if not download_dir.exists():
        # Lazy %-style arguments: the message is only formatted if emitted.
        logger.info("creating directory '%s'", download_dir)
        download_dir.mkdir(parents=True)

    # Cache hit: reuse the previously downloaded page.
    if download_path.exists():
        return download_path

    logger.info("'%s' doesn't exist, I'll download it using '%s'",
                download_path, url)
    response = requests.get(URL_ROOT_DOMAIN + url)
    if response.status_code != 200:
        raise Exception('connection to \'%s\' failed' % url)

    # Explicit encoding: the locale default could fail on accented Italian
    # characters (e.g. on Windows with cp1252).
    with download_path.open("w", encoding="utf-8") as f:
        f.write(response.text)

    return download_path
def usage(argname):
    """Write a one-line usage message for *argname* and exit with status 1."""
    sys.stdout.write(("usage: %s" % argname) + "\n")
    sys.exit(1)
def csv_from_html(filepath):
    """Parse the cached region page at *filepath* into CSV lines.

    Each table row becomes one comma-joined string; for the e-mail column
    the address is extracted from the 'mailto:' link rather than the cell
    text.

    Returns:
        list of CSV line strings (no trailing newlines).
    """
    with open(filepath) as handle:
        document = HTML(html=handle.read())

    # The page nests the data table under a stray <a> element.
    table = document.xpath('/html/body/a/table')[0]
    body = table.find('tbody')[0]

    lines = []
    for table_row in body.find('tr'):
        cells = []
        for position, cell in enumerate(table_row.find('td')):
            if position == COLUMN_EMAIL:
                href = cell.find('a')[0].attrs['href']
                cells.append(href[len('mailto:'):])
            else:
                cells.append(cell.text)
        lines.append(','.join(cells))

    return lines
def parse_root_page(filename):
    """Extract the per-region links from the cached root page.

    Args:
        filename: name of the cached root page inside the downloads folder.

    Returns:
        list of (region_name, relative_url) tuples; region_name is
        lower-cased.
    """
    filepath = get_download_path(filename)
    ret = []
    with filepath.open() as f:
        contents = f.read()
    html = HTML(html=contents)
    # find() already returns an indexable sequence: index it directly
    # instead of copying it through a throwaway list comprehension.
    inner_table = html.find('table')[0].find('table')[2]
    for row in inner_table.find('tr'):
        # Rows without a link are headers/separators: skip them.
        if not row.find('a'):
            continue
        link = row.find('a')[0].attrs['href']
        regione = row.text.lower().split('\n')[0]  # there are a couple of '\n'
        ret.append((regione, link))
    return ret
def save_csv(url, name):
    """Download the region page at *url* and write its rows to '<name>.csv'."""
    cached_page = download_and_save(url, '%s.html' % name)
    csv_lines = csv_from_html(cached_page)
    output_path = Path('%s.csv' % name)
    with output_path.open("w") as out:
        # Single batched write instead of two write() calls per row.
        out.writelines(line + '\n' for line in csv_lines)
def retrieve_regions():
    """Fetch and cache the root page, then return its (region, url) pairs."""
    cache_name = 'root.html'
    download_and_save(URL_ROOT_PAGE, cache_name)
    return parse_root_page(cache_name)
if __name__ == '__main__':
    # One CSV file per region, named after the region itself.
    for region_name, url in retrieve_regions():
        save_csv(url, region_name)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.