Skip to content

Instantly share code, notes, and snippets.

@robsalasco
Created Mar 18, 2020
Embed
What would you like to do?
from requests import get
from lxml import html, etree
import camelot
import pandas as pd
import numpy as np
# Scrape the MINSAL "confirmed COVID-19 cases" page, read the first linked PDF
# report with camelot, tidy the extracted table and print it as JSON records.

url = 'https://www.minsal.cl/nuevo-coronavirus-2019-ncov/casos-confirmados-en-chile-covid-19/'

# Location of the table of report links inside the page. This is a positional
# XPath, so it breaks as soon as the page layout changes.
REPORTS_TABLE_XPATH = '//*[@id="main"]/div[2]/div[5]/div/table[1]'

# Positional columns of the raw camelot table that get blank-filling and
# whitespace normalisation before the header row is promoted.
TEXT_COLUMNS = (0, 1, 4)


def fetch_report_links(page_url, timeout=30):
    """Return ``[label, href]`` pairs for the rows of the report table.

    Args:
        page_url: Address of the page that contains the report table.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of ``[first cell text, first link href]`` lists, one per table
        row after the first (header) row. Rows that lack either a text cell
        or a link are skipped instead of raising ``IndexError``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the report table cannot be found in the page.
    """
    response = get(page_url, timeout=timeout)
    response.raise_for_status()
    document = html.fromstring(response.content)

    matches = document.xpath(REPORTS_TABLE_XPATH)
    if not matches:
        raise ValueError(
            "report table not found at XPath %r" % REPORTS_TABLE_XPATH
        )

    links = []
    for row in matches[0].xpath(".//tr")[1:]:
        labels = row.xpath(".//td/text()")
        hrefs = row.xpath(".//td/a/@href")
        if labels and hrefs:
            links.append([labels[0], hrefs[0]])
    return links


def clean_table(raw, text_columns=TEXT_COLUMNS):
    """Tidy a raw camelot table and return it as a new DataFrame.

    Steps, in order:

    1. In each column listed in ``text_columns``, empty strings are treated
       as missing and forward-filled from the cell above, then every run of
       whitespace (including newlines) is collapsed to a single space.
    2. The row at position 1 becomes the column header and the first two
       rows are dropped.
    3. The index is reset; the previous row labels are kept in an ``index``
       column, so that column is part of the result.
    4. The value ``"Metropolitan a"`` in the ``Región`` column is corrected
       to ``"Metropolitana"``.

    Args:
        raw: DataFrame with positional integer column labels, as produced by
            ``camelot``'s ``Table.df``. It is not modified.
        text_columns: Column labels of ``raw`` to clean in step 1.

    Returns:
        The cleaned DataFrame.

    Raises:
        KeyError: If the promoted header has no ``Región`` column.
    """
    table = raw.copy()

    for col in text_columns:
        column = table[col]
        # Assign whole columns back instead of using chained indexing, which
        # pandas does not guarantee to write through to the frame.
        filled = column.mask(column == "").ffill()
        # An explicit regex keeps this independent of the default of
        # ``Series.str.replace``'s ``regex`` argument.
        table[col] = filled.str.replace(r"\s+", " ", regex=True)

    table.columns = table.iloc[1]
    table = table.drop(table.index[:2]).reset_index()
    table["Región"] = table["Región"].replace(
        "Metropolitan a", "Metropolitana"
    )
    return table


# NOTE: the original gist carried a disabled attempt to split cells holding
# several newline-separated 'Sexo'/'Edad' values into separate rows. That step
# is not performed here, so such cells are emitted as they were extracted.

if __name__ == "__main__":
    info = fetch_report_links(url)
    if not info:
        raise SystemExit("no report links found at " + url)

    # Only the first listed report is processed.
    tables = camelot.read_pdf(info[0][1])
    tablespd = clean_table(tables[0].df)

    # Display options only affect printed DataFrames; kept for interactive
    # inspection of ``tablespd``.
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)

    print(tablespd.to_json(orient='records', force_ascii=False))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment