Skip to content

Instantly share code, notes, and snippets.

@robsalasco
Created March 16, 2020 21:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save robsalasco/89ec1f6c4ab34d6e566650427e464e94 to your computer and use it in GitHub Desktop.
Save robsalasco/89ec1f6c4ab34d6e566650427e464e94 to your computer and use it in GitHub Desktop.
from requests import get
from lxml import html, etree
import camelot
import pandas as pd
import numpy as np
url = 'https://www.minsal.cl/nuevo-coronavirus-2019-ncov/casos-confirmados-en-chile-covid-19/'

# XPath of the table on the page whose rows carry a text cell and a link cell.
REPORT_TABLE_XPATH = '//*[@id="main"]/div[2]/div[5]/div/table[1]'
# Positional columns of the extracted PDF table that get blank-filled and
# whitespace-normalised.
FILL_COLUMNS = (0, 1, 4)
# Seconds to wait for the web server before giving up.
REQUEST_TIMEOUT = 30


def fetch_report_links(page_url=url, timeout=REQUEST_TIMEOUT):
    """Return ``[label, href]`` pairs scraped from the report table.

    Args:
        page_url: Address of the page that contains the table.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        One ``[first cell text, first link href]`` list per table row after
        the header row, in page order. Rows lacking either a text cell or a
        link are skipped instead of aborting the whole scrape.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        LookupError: If the table cannot be located in the page.
    """
    response = get(page_url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    document = html.fromstring(response.content)

    matches = document.xpath(REPORT_TABLE_XPATH)
    if not matches:
        raise LookupError(
            f"report table not found at {REPORT_TABLE_XPATH!r}; "
            "the page layout may have changed"
        )

    links = []
    # The first <tr> is skipped, as in the original script (header row).
    for row in matches[0].xpath(".//tr")[1:]:
        labels = row.xpath(".//td/text()")
        hrefs = row.xpath(".//td/a/@href")
        if labels and hrefs:
            links.append([labels[0], hrefs[0]])
    return links


def clean_table(table, fill_columns=FILL_COLUMNS):
    """Normalise a raw extracted table into a DataFrame with a real header.

    For each column in ``fill_columns``: empty-string cells are replaced by
    the closest non-empty value above them, newlines become spaces, and runs
    of spaces are collapsed. Afterwards the row at position 1 becomes the
    column header and the first two rows are dropped.

    Args:
        table: DataFrame whose columns are addressable by the labels in
            ``fill_columns``. It is not modified.
        fill_columns: Column labels to forward-fill and tidy.

    Returns:
        A new, cleaned DataFrame.
    """
    cleaned = table.copy()
    for col in fill_columns:
        column = cleaned[col]
        # mask() + ffill() replaces the chained-assignment / np.NaN /
        # fillna(method=...) combination, which is deprecated or removed in
        # current pandas and NumPy.
        column = column.mask(column == "").ffill()
        # regex=False makes this match a real newline on every pandas
        # version; the default for ``regex`` changed in pandas 2.0.
        column = column.str.replace("\n", " ", regex=False)
        # NOTE(review): the second replace in the original was a no-op as
        # written (' ' -> ' '); presumably it was meant to collapse doubled
        # spaces -- confirm.
        column = column.str.replace(r" {2,}", " ", regex=True)
        cleaned[col] = column

    # Row 1 holds the column titles; rows 0 and 1 are then discarded.
    cleaned.columns = cleaned.iloc[1]
    return cleaned.drop(cleaned.index[:2])


def main():
    """Scrape the first linked PDF, clean its first table, print it as JSON."""
    links = fetch_report_links()
    if not links:
        raise LookupError("no report links found in the table")

    tables = camelot.read_pdf(links[0][1])
    frame = clean_table(tables[0].df)

    # Kept from the original script; it affects only how DataFrames are
    # displayed, not the JSON output below.
    pd.set_option('display.max_rows', None)
    print(frame.to_json(orient='records', force_ascii=False))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment