Skip to content

Instantly share code, notes, and snippets.

@robsalasco
Created March 17, 2020 22:53
Show Gist options
  • Save robsalasco/8626f6efdc8589793557f4519d20d41f to your computer and use it in GitHub Desktop.
Save robsalasco/8626f6efdc8589793557f4519d20d41f to your computer and use it in GitHub Desktop.
from requests import get
from lxml import html, etree
import camelot
import pandas as pd
import numpy as np
url = 'https://www.minsal.cl/nuevo-coronavirus-2019-ncov/casos-confirmados-en-chile-covid-19/'

# Columns of the raw camelot table whose blank cells inherit the value above.
FILL_DOWN_COLUMNS = (0, 1, 4)
# Column layout of the rows rebuilt from a merged multi-line row.
CASE_COLUMNS = ['Región', 'Casos confirmados', 'Sexo', 'Edad', 'Centro de salud']


def fetch_report_links(page_url=url, timeout=30):
    """Return ``[label, href]`` pairs for the rows of the report table.

    Downloads ``page_url``, locates the first table under the hard-coded
    XPath and, skipping its first row, collects the first cell text and
    the first link target of every remaining row.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        IndexError: if the table, a cell text or a link is missing.
    """
    response = get(page_url, timeout=timeout)
    # Fail loudly instead of parsing an error page as if it were the report.
    response.raise_for_status()
    document = html.fromstring(response.content)
    report_table = document.xpath('//*[@id="main"]/div[2]/div[5]/div/table[1]')[0]
    return [
        [row.xpath(".//td/text()")[0], row.xpath(".//td/a/@href")[0]]
        for row in report_table.xpath(".//tr")[1:]
    ]


def clean_table(raw):
    """Return a tidied copy of the raw table extracted from the PDF.

    * Blank cells in ``FILL_DOWN_COLUMNS`` are filled with the value above.
    * Line breaks inside those columns become single spaces.
    * The second row becomes the header and the first two rows are dropped.
    * The wrapped region name "Metropolitan a" is repaired.

    ``raw`` itself is not modified.
    """
    table = raw.copy()
    for column in FILL_DOWN_COLUMNS:
        # mask() + ffill() replaces the chained assignment and the
        # deprecated fillna(method='ffill') of the original script.
        filled = table[column].mask(table[column] == "").ffill()
        # regex=False: the newline must be matched literally regardless of
        # the pandas version's default for ``regex``.
        filled = filled.str.replace('\n', ' ', regex=False)
        # NOTE(review): the second replace in the original mapped a single
        # space to a single space (no effect as written); presumably it was
        # meant to collapse repeated spaces left by joining lines - confirm.
        table[column] = filled.str.replace(r' {2,}', ' ', regex=True)

    table.columns = table.iloc[1]
    table = table.iloc[2:].reset_index(drop=True)
    # Assign back instead of calling replace(inplace=True) on a column
    # selection, which does not update the frame under copy-on-write.
    table['Región'] = table['Región'].replace("Metropolitan a", "Metropolitana")
    return table


def expand_merged_rows(table):
    """Split rows whose 'Edad' cell holds several newline-separated values.

    Each such row is removed and replaced by one row per (Sexo, Edad) pair,
    repeating the row's own 'Región', 'Casos confirmados' and
    'Centro de salud'. The rebuilt rows are appended after the untouched
    rows and the index is renumbered from 0. When 'Sexo' and 'Edad' hold a
    different number of values, the extra values are ignored.

    A table without merged rows is returned unchanged apart from the
    renumbered index.
    """
    merged_mask = table['Edad'].str.contains('\n', na=False, regex=False)
    if not merged_mask.any():
        return table.reset_index(drop=True)

    merged = table[merged_mask]
    rebuilt = [
        (region, cases, sex, age, centre)
        for region, cases, sexes, ages, centre in zip(
            merged['Región'],
            merged['Casos confirmados'],
            merged['Sexo'].str.split('\n'),
            merged['Edad'].str.split('\n'),
            merged['Centro de salud'],
        )
        for sex, age in zip(sexes, ages)
    ]
    fixed_rows = pd.DataFrame(rebuilt, columns=CASE_COLUMNS)
    return pd.concat([table[~merged_mask], fixed_rows], ignore_index=True)


def main(pdf_source=None):
    """Print the confirmed-cases table as a JSON list of records.

    Args:
        pdf_source: location handed to ``camelot.read_pdf``. When omitted,
            the link of the first report listed on the page at ``url`` is
            used. Passing a local PDF path skips the download of the page,
            which is handy for offline debugging.
    """
    if pdf_source is None:
        pdf_source = fetch_report_links()[0][1]
    tables = camelot.read_pdf(pdf_source)
    tablespd = expand_merged_rows(clean_table(tables[0].df))

    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    print(tablespd.to_json(orient='records', force_ascii=False))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment