Skip to content

Instantly share code, notes, and snippets.

@flxai
Created November 21, 2020 00:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save flxai/a2c1827565a9d8b2512021a34720fa4f to your computer and use it in GitHub Desktop.
Save flxai/a2c1827565a9d8b2512021a34720fa4f to your computer and use it in GitHub Desktop.
Download URL, select table and have interactive IPython shell with defined DataFrame
#!/usr/bin/env python
# dfget gets URL and interprets interactively selected table as Pandas DataFrame
import inquirer
import requests
import sys
import IPython
import pandas as pd
from bs4 import BeautifulSoup
MAX_ROWS_PREVIEW = 5
def bail(msg=None, code=1):
if msg is not None:
print(msg)
sys.exit(code)
def print_usage():
print("Usage: dfget URL")
if len(sys.argv) <= 1:
print('Not enough parameters')
print_usage()
bail()
url = sys.argv[1]
# Download HTML from given URL using a common user agent
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
}
r = requests.get(url, headers=headers)
status_code = r.status_code
status_text = requests.status_codes._codes[status_code][0].replace('_', ' ').upper()
if status_code != 200:
bail(f'{status_code} {status_text}: Error downloading {url}')
# If multiple tables available, select interactively
soup = BeautifulSoup(r.text, features="lxml")
tables = soup.select('table')
# Terse preview for easier selection
max_rows = pd.get_option('display.max_rows')
pd.set_option('display.max_rows', MAX_ROWS_PREVIEW)
if len(tables) == 1:
table_id = 0
else:
with pd.option_context('display.colheader_justify','left'):
choices = [
str(pd.read_html(str(table[1].encode()))) + '\n'
for table in enumerate(tables)
]
choices = [
choice.replace('\n', '\n ')
for choice in choices
]
questions = [
inquirer.List('table', message="What table do you want to select?", choices=choices),
]
table_id = choices.index(inquirer.prompt(questions)['table'])
pd.set_option('display.max_rows', max_rows)
table = tables[table_id]
df = pd.read_html(table.encode())[0]
# Start interactive session
IPython.start_ipython(user_ns={'df': df.copy()}, argv=[])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment