Skip to content

Instantly share code, notes, and snippets.

@eisenjulian
Created October 20, 2020 22:34
Show Gist options
  • Save eisenjulian/81dd3a1d7e2e0ea9c594082f5b737506 to your computer and use it in GitHub Desktop.
Save eisenjulian/81dd3a1d7e2e0ea9c594082f5b737506 to your computer and use it in GitHub Desktop.
import pdfminer.high_level
import datetime
import requests
import sys
import os
import re
import unidecode
import collections
def split(delimiters, string, maxsplit=0):
    """Split *string* on any of several literal delimiters.

    Args:
        delimiters: Iterable of delimiter strings.  Each one is matched
            literally (regex metacharacters are escaped).  When several
            delimiters could match at the same position, the one listed
            first wins, so list longer delimiters before their prefixes.
            Empty delimiters are ignored.
        string: The text to split.
        maxsplit: Maximum number of splits to perform; 0 means no limit.

    Returns:
        The list of pieces of *string* found between delimiter matches.
        With no usable delimiter the result is ``[string]``.
    """
    pattern = '|'.join(re.escape(d) for d in delimiters if d)
    if not pattern:
        # An empty pattern would match between every character and shred
        # the input; with nothing to split on, return the text untouched.
        return [string]
    # maxsplit goes by keyword: passing it positionally to re.split is
    # deprecated since Python 3.13.
    return re.split(pattern, string, maxsplit=maxsplit)
# Jurisdictions reported on, in output order.  get_cases() and get_deaths()
# look each name up after lower-casing it and stripping accents, so the
# spelling here must match the report text modulo case and diacritics.
provinces = [
    "Buenos Aires", "CABA", "Catamarca", "Chaco",
    "Chubut", "Córdoba", "Corrientes", "Entre Ríos",
    "Formosa", "Jujuy", "La Pampa", "La Rioja",
    "Mendoza", "Misiones", "Neuquén", "Río Negro",
    "Salta", "San Juan", "San Luis", "Santa Cruz",
    "Santa Fe", "Santiago del Estero", "Tierra del Fuego", "Tucumán",
]
def get_cases(rows):
    """Tally case counts per province from the report's text fragments.

    A fragment is used only if, after dropping '.' and '*' characters,
    transliterating to ASCII and lower-casing, it contains a standalone
    ``|`` token with a numeric token directly on each side.  The number to
    the left of the bar is added to the province named by the words that
    precede it; fragments containing the word "ciudad" or "caba" are filed
    under "caba".

    Args:
        rows: Iterable of text fragments (strings).

    Returns:
        A dict mapping every name in ``provinces`` to its accumulated
        count, 0 when no fragment matched that province.
    """
    cases = collections.Counter()
    for row in rows:
        # Dots are removed before tokenising (presumably thousands
        # separators -- confirm against a real report); asterisks are
        # dropped as well so they do not end up in the province name.
        tokens = unidecode.unidecode(
            row.replace('.', '').replace('*', '')).lower().split()
        if '|' not in tokens:
            continue
        bar = tokens.index('|')
        # The bar needs a neighbour on both sides.  At position 0 the left
        # lookup would silently wrap around to the last token, and at the
        # final position the right lookup would raise IndexError.
        if bar == 0 or bar == len(tokens) - 1:
            continue
        if not tokens[bar - 1].isnumeric():
            continue
        if not tokens[bar + 1].isnumeric():
            continue
        if "ciudad" in tokens or "caba" in tokens:
            province = "caba"
        else:
            # Everything before the left-hand number is the province name.
            province = ' '.join(tokens[:bar - 1])
        cases[province] += int(tokens[bar - 1])
    # Counter returns 0 for provinces that never appeared.
    return {province: cases[unidecode.unidecode(province.lower())] for province in provinces}
def get_deaths(rows):
    """Tally death counts per province from the report's text fragments.

    A fragment is used only if it has more than five whitespace-separated
    tokens, its first token is a decimal number and its second token is
    "residente" or "residentes" (case-insensitive).  The province name is
    taken from the seventh token onward; fragments containing the word
    "ciudad" or "caba" are filed under "caba".

    Args:
        rows: Iterable of text fragments (strings).

    Returns:
        A dict mapping every name in ``provinces`` to its accumulated
        count, 0 when no fragment matched that province.
    """
    deaths = collections.Counter()
    for row in rows:
        tokens = row.lower().split()
        if len(tokens) <= 5:
            continue
        if tokens[1] not in ("residentes", "residente"):
            continue
        # isdecimal() rather than isnumeric(): the latter also accepts
        # characters such as superscripts or vulgar fractions, which int()
        # below cannot parse and would turn into a ValueError.
        if not tokens[0].isdecimal():
            continue
        if "ciudad" in tokens or "caba" in tokens:
            province = "caba"
        else:
            # Skips the count plus five words of lead-in; presumably the
            # phrase reads "<n> residentes en la provincia de <name>" --
            # confirm against a real report.
            province = unidecode.unidecode(' '.join(tokens[6:]))
        deaths[province] += int(tokens[0])
    # Counter returns 0 for provinces that never appeared.
    return {province: deaths[unidecode.unidecode(province.lower())] for province in provinces}
def run():
    """Fetch (if needed) and parse one evening COVID-19 report, then print totals.

    The report date comes from the single optional command-line argument in
    ``dd-mm-yy`` form (e.g. ``19-10-20``) and defaults to today's local
    date.  The PDF is cached in the working directory as
    ``report-<date>.pdf`` and only downloaded from argentina.gob.ar when
    that file is missing.  Its text is split into fragments, which are fed
    to get_deaths() and get_cases(); the per-province dicts and their sums
    are printed to stdout.

    Returns:
        None.  Returns early, after printing a message, on wrong usage or
        when the report cannot be downloaded.
    """
    if len(sys.argv) > 2:
        print('Usage: python crawl_report 19-10-20')
        return
    elif len(sys.argv) == 2:
        formated_date = sys.argv[1]
    else:
        formated_date = datetime.datetime.now().strftime('%d-%m-%y')
    filename = f'report-{formated_date}.pdf'
    if os.path.exists(filename):
        print('File already exists')
    else:
        url = f'https://www.argentina.gob.ar/sites/default/files/{formated_date}-reporte-vespertino-covid-19.pdf'
        print('Fetching url', url)
        try:
            # Without a timeout requests waits indefinitely on a stalled
            # connection.
            result = requests.get(url, timeout=30)
        except requests.RequestException as error:
            print('Could not fetch report:', error)
            return
        if not result.ok:
            print('Could not fetch report')
            return
        with open(filename, 'wb') as outfile:
            outfile.write(result.content)
    # Turn separators into ". " so that split() below cuts the text there.
    # NOTE(review): as written the first replace() targets every single
    # space, which would put each word in its own fragment; the literal may
    # originally have been a different whitespace sequence (e.g. a double
    # or non-breaking space) -- confirm against the original script.
    report = pdfminer.high_level.extract_text(filename).replace(' ', '. ').replace('\n', '. ')
    # Order matters: split() prefers the delimiter listed first when
    # several match at the same position, so "; y " precedes "; ".
    delimiters = "; y ", ", y ", "; ", ", ", ". ", "- ", " - ", " *"
    rows = split(delimiters, report)
    deaths = get_deaths(rows)
    cases = get_cases(rows)
    print("total deaths", sum(deaths.values()))
    print(deaths)
    print("total cases", sum(cases.values()))
    print(cases)
# Run the crawler only when this file is executed as a script, not on import.
if __name__ == '__main__':
    run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment