Skip to content

Instantly share code, notes, and snippets.

@gipi
Last active July 18, 2019 11:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gipi/97f7c9813878777e50cf5a492f4bf29b to your computer and use it in GitHub Desktop.
Save gipi/97f7c9813878777e50cf5a492f4bf29b to your computer and use it in GitHub Desktop.
Creates CSV files listing the PEC (certified e-mail) addresses of the Italian municipalities. From the main page it retrieves the URL of each region, from which it builds the corresponding CSV.
# Scraper setup: stdlib imports first, then third-party, then the
# module-level configuration shared by every function below.
import logging
import os
import re
import sys
from pathlib import Path

import requests
from requests_html import HTML

logging.basicConfig()
logger = logging.getLogger()
logger.setLevel('INFO')

# Index of the <td> column that holds the mailto: anchor in the region tables.
COLUMN_EMAIL = 3
URL_ROOT_DOMAIN = 'http://www.comuniverso.it/'
URL_ROOT_PAGE = 'index.cfm?Comuni_con_PEC&menu=271'
DOWNLOADS_FOLDER = 'downloads'
DIR = os.path.dirname(__file__)


def get_download_path(filename):
    """Return the local cache path for *filename* inside the downloads dir."""
    return Path(DIR) / DOWNLOADS_FOLDER / filename
def download_and_save(url, filename):
    """Download *url* (relative to URL_ROOT_DOMAIN) and cache it as *filename*.

    The file is stored under the downloads directory, which is created on
    demand.  If the cached file already exists the download is skipped.

    Returns the Path of the cached file.
    Raises Exception when the HTTP response status is not 200.
    """
    download_dir = Path(DIR) / DOWNLOADS_FOLDER
    download_path = download_dir / filename

    if not download_dir.exists():
        # lazy %-args: the message is only rendered if INFO is enabled
        logger.info("creating directory '%s'", download_dir)
        download_dir.mkdir(parents=True)

    # Cached copy wins: never re-download an existing file.
    if download_path.exists():
        return download_path

    logger.info("'%s' doesn't exist, I'll download it using '%s'",
                download_path, url)
    response = requests.get(URL_ROOT_DOMAIN + url)

    if response.status_code != 200:
        raise Exception("connection to '%s' failed" % url)

    with download_path.open("w") as f:
        f.write(response.text)

    return download_path
def usage(argname):
    """Emit a usage line for *argname* on stdout and exit with status 1."""
    sys.stdout.write("usage: %s\n" % argname)
    sys.exit(1)
def csv_from_html(filepath):
    """Parse a cached region page and return its table rows as CSV strings.

    The e-mail column (COLUMN_EMAIL) is taken from the ``mailto:`` href of
    its anchor; every other column contributes its plain text.
    """
    with open(filepath) as f:
        contents = f.read()

    html = HTML(html=contents)
    table = html.xpath('/html/body/a/table')[0]
    tbody = table.find('tbody')[0]

    rows = []
    for row in tbody.find('tr'):
        fields = []
        for idx, cell in enumerate(row.find('td')):
            if idx == COLUMN_EMAIL:
                # strip the leading "mailto:" scheme from the anchor target
                href = cell.find('a')[0].attrs['href']
                fields.append(href[len('mailto:'):])
            else:
                fields.append(cell.text)
        rows.append(','.join(fields))

    return rows
def parse_root_page(filename):
    """Extract ``(region_name, relative_url)`` pairs from the cached root page.

    Rows without an anchor are skipped; the region name is lower-cased and
    taken from the first line of the row text.
    """
    filepath = get_download_path(filename)

    with filepath.open() as f:
        html = HTML(html=f.read())

    # The list of regions lives in the third table nested inside the first.
    region_table = [t for t in html.find('table')[0].find('table')][2]

    regions = []
    for row in region_table.find('tr'):
        anchors = row.find('a')
        if not anchors:
            continue
        link = anchors[0].attrs['href']
        # row.text contains a couple of '\n'; the name is the first line
        region = row.text.lower().split('\n')[0]
        regions.append((region, link))

    return regions
def save_csv(url, name):
    """Download the region page at *url* and write ``<name>.csv`` alongside
    the current working directory, one CSV line per table row."""
    cached = download_and_save(url, '%s.html' % name)
    rows = csv_from_html(cached)
    with Path('%s.csv' % name).open("w") as f:
        f.writelines(row + '\n' for row in rows)
def retrieve_regions():
    """Fetch and cache the root page, then return its (region, url) pairs."""
    root = 'root.html'
    download_and_save(URL_ROOT_PAGE, root)
    return parse_root_page(root)
if __name__ == '__main__':
    # One CSV per region: region name doubles as the output file stem.
    for region_name, region_url in retrieve_regions():
        save_csv(region_url, region_name)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment