Skip to content

Instantly share code, notes, and snippets.

Created Mar 16, 2020
What would you like to do?
from requests import get
from lxml import html, etree
import camelot
import pandas as pd
import numpy as np
def clean_table_columns(frame, columns=(0, 1, 4)):
    """Fill blanks downward and normalise whitespace in selected columns.

    For each label in *columns*, empty-string cells are treated as missing
    and filled with the nearest non-missing value above them; then every run
    of whitespace (including embedded newlines) is collapsed to one space.

    Args:
        frame: DataFrame whose listed columns hold strings. Modified in place.
        columns: Column labels to clean. Defaults to the positions the
            original script hard-coded.

    Returns:
        The same *frame* object, to allow chaining.

    Raises:
        KeyError: If a label in *columns* is not present in *frame*.
    """
    for label in columns:
        series = frame[label]
        # Mask on the Series and assign the whole column back; the original
        # chained form ``frame[x][mask] = ...`` may write to a temporary copy.
        filled = series.mask(series == "").ffill()
        # Explicit regex=True: pandas >= 2.0 treats the pattern literally by
        # default, so a bare r'\n' would no longer match real newlines.
        # NOTE(review): the original second replace reads as ' ' -> ' ' (a
        # no-op); presumably it collapsed doubled or non-breaking spaces
        # before extraction mangled it. r'\s+' covers both cases -- confirm.
        frame[label] = filled.str.replace(r"\s+", " ", regex=True)
    return frame


if __name__ == "__main__":
    # TODO: set this to the page that lists the PDF reports; requests rejects
    # an empty URL.
    url = ''
    response = get(url, timeout=30)
    # Fail here on an HTTP error instead of parsing an error page below.
    response.raise_for_status()
    file = response.content
    # For offline work, a saved copy of the page can be loaded into `file`
    # instead of fetching it.
    source_code = html.fromstring(file)
    tree = source_code.xpath('//*[@id="main"]/div[2]/div[5]/div/table[1]')[0]
    # One [first cell text, first link href] pair per row, skipping the
    # header row.
    info = [
        [row.xpath(".//td/text()")[0], row.xpath(".//td/a/@href")[0]]
        for row in tree.xpath(".//tr")[1:]
    ]
    # Only the first listed PDF is parsed, and only its first detected table.
    tables = camelot.read_pdf(info[0][1])
    tablespd = clean_table_columns(tables[0].df)
    # Row 1 supplies the column names; rows 0 and 1 are then dropped.
    tablespd.columns = tablespd.iloc[1]
    tablespd.drop(tablespd.index[:2], inplace=True)
    pd.set_option('display.max_rows', None)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment