Skip to content

Instantly share code, notes, and snippets.

@janfreyberg
Created March 25, 2018 22:19
Show Gist options
  • Save janfreyberg/1f5765d544f987670c297ab024d86ea8 to your computer and use it in GitHub Desktop.
Save janfreyberg/1f5765d544f987670c297ab024d86ea8 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 115,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"from bs4 import BeautifulSoup\n",
"import time\n",
"from tqdm import tqdm_notebook\n",
"import daiquiri\n",
"import logging\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 105,
"metadata": {},
"outputs": [],
"source": [
"# Send records at INFO level and above to a log file.\n",
"daiquiri.setup(\n",
"    level=logging.INFO,\n",
"    outputs=(daiquiri.output.File(\"getting_years.log\"),),\n",
")\n",
"logger = daiquiri.getLogger('scraping_years')\n"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {},
"outputs": [],
"source": [
"def get_section(soup, section='events'):\n",
"    \"\"\"Collect the list items found under one section heading of a page.\n",
"\n",
"    The heading is located through the element whose ``id`` equals\n",
"    ``section.title()`` (``'events'`` looks up ``id='Events'``). Every\n",
"    ``<ul>`` sibling that follows that element's parent is scanned until\n",
"    the next ``<h2>`` sibling is reached.\n",
"\n",
"    Args:\n",
"        soup: Parsed page (a ``BeautifulSoup`` object).\n",
"        section: Lower-case name of the section to read.\n",
"\n",
"    Returns:\n",
"        A list of ``<li>`` tags, one per leaf bullet (a bullet that holds\n",
"        no nested bullets). Empty when the page has no such section.\n",
"    \"\"\"\n",
"    anchor = soup.find(id=section.title())\n",
"    if anchor is None:\n",
"        return []\n",
"\n",
"    events = []\n",
"    for sibling in anchor.parent.next_siblings:\n",
"        # Text nodes sit between the tags; treat anything without a tag\n",
"        # name as neither a list nor a heading.\n",
"        tag_name = getattr(sibling, 'name', None)\n",
"        if tag_name == 'h2':\n",
"            # The next heading of the same level ends this section.\n",
"            break\n",
"        if tag_name == 'ul':\n",
"            # find_all() already descends into nested lists, so keep only\n",
"            # the leaf bullets; collecting a parent's children separately\n",
"            # as well would add every nested item twice.\n",
"            events.extend(\n",
"                li for li in sibling.find_all('li') if li.find('li') is None\n",
"            )\n",
"\n",
"    return events"
]
},
{
"cell_type": "code",
"execution_count": 107,
"metadata": {},
"outputs": [],
"source": [
"def get_all_sections(year, sections=('events', 'deaths', 'births'),\n",
"                     logger=logger, timeout=30):\n",
"    \"\"\"Download the Wikipedia article for a year and extract its sections.\n",
"\n",
"    Args:\n",
"        year: Integer year; negative values denote years BC.\n",
"        sections: Names of the sections to extract with ``get_section``.\n",
"        logger: Logger that records the HTTP status of the request, or\n",
"            ``None`` to disable logging.\n",
"        timeout: Seconds to wait for the server before ``requests`` raises\n",
"            a timeout error, so a stalled connection cannot hang a long\n",
"            scraping run.\n",
"\n",
"    Returns:\n",
"        A dict mapping each section name to the list returned by\n",
"        ``get_section``; every list is empty when the response status is\n",
"        not 200.\n",
"    \"\"\"\n",
"    if year < 0:\n",
"        # Wikipedia titles BC year articles '<n> BC' (for example '44 BC').\n",
"        url = f'https://en.wikipedia.org/wiki/{abs(year)}_BC'\n",
"    else:\n",
"        url = f'https://en.wikipedia.org/wiki/AD_{abs(year)}'\n",
"\n",
"    r = requests.get(url, timeout=timeout)\n",
"    if logger is not None:\n",
"        logger.info(f\"{year}: {r.status_code}\")\n",
"\n",
"    if r.status_code != 200:\n",
"        return {section: [] for section in sections}\n",
"\n",
"    soup = BeautifulSoup(r.text, 'lxml')\n",
"    return {section: get_section(soup, section) for section in sections}"
]
},
{
"cell_type": "code",
"execution_count": 108,
"metadata": {},
"outputs": [],
"source": [
"years = {}"
]
},
{
"cell_type": "code",
"execution_count": 109,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "1ebf782c439b4cccbc2a5b54302237ce",
"version_major": 2,
"version_minor": 0
},
"text/html": [
"<p>Failed to display Jupyter Widget of type <code>HBox</code>.</p>\n",
"<p>\n",
" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n",
" that the widgets JavaScript is still loading. If this message persists, it\n",
" likely means that the widgets JavaScript library is either not installed or\n",
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
" Widgets Documentation</a> for setup instructions.\n",
"</p>\n",
"<p>\n",
" If you're reading this message in another frontend (for example, a static\n",
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
" it may mean that your frontend doesn't currently support widgets.\n",
"</p>\n"
],
"text/plain": [
"HBox(children=(IntProgress(value=0, max=2018), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"# Years that are already stored are skipped, so an interrupted run can be\n",
"# resumed by re-running this cell instead of downloading every page again.\n",
"for year in tqdm_notebook(range(1, 2019)):\n",
"    if year in years:\n",
"        continue\n",
"    years[year] = get_all_sections(year)\n",
"    # Pause between requests to keep the request rate low.\n",
"    time.sleep(3)"
]
},
{
"cell_type": "code",
"execution_count": 112,
"metadata": {},
"outputs": [],
"source": [
"# Reduce the scraped lists to plain counts: {year: {section: item count}}.\n",
"year_numbers = {\n",
"    yr: {name: len(entries) for name, entries in by_section.items()}\n",
"    for yr, by_section in years.items()\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 118,
"metadata": {},
"outputs": [],
"source": [
"# One row per year, one column per section; the index is labelled 'year'.\n",
"years_df = (\n",
"    pd.DataFrame\n",
"    .from_dict(year_numbers, orient='index')\n",
"    .rename_axis('year')\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 119,
"metadata": {},
"outputs": [],
"source": [
"years_df.to_csv('year_metrics.csv')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:Python3]",
"language": "python",
"name": "conda-env-Python3-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment