Created
March 25, 2018 22:19
-
-
Save janfreyberg/1f5765d544f987670c297ab024d86ea8 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 115, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import requests\n", | |
"from bs4 import BeautifulSoup\n", | |
"import time\n", | |
"from tqdm import tqdm_notebook\n", | |
"import daiquiri\n", | |
"import logging\n", | |
"import pandas as pd" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 105, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"daiquiri.setup(\n", | |
" level=logging.INFO, outputs=(\n", | |
" daiquiri.output.File(\"getting_years.log\"),\n", | |
"))\n", | |
"logger = daiquiri.getLogger('scraping_years')\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 94, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def get_section(soup, section='events'):\n", | |
" if soup.find(id=section.title()):\n", | |
" event_header = soup.find(id=section.title()).parent\n", | |
" events = []\n", | |
" for sibling in event_header.next_siblings:\n", | |
" if sibling.name == 'ul':\n", | |
" for item in sibling.find_all('li'):\n", | |
" if item.find_all('li'):\n", | |
" events += [li for li in item.find_all('li')]\n", | |
" else:\n", | |
" events.append(item)\n", | |
"\n", | |
" if sibling.name == 'h2':\n", | |
" break\n", | |
"\n", | |
" return events\n", | |
" else:\n", | |
" return []" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 107, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def get_all_sections(year, sections=['events', 'deaths', 'births'], logger=logger):\n", | |
" if year < 0:\n", | |
" url = f'https://en.wikipedia.org/wiki/BC_{abs(year)}'\n", | |
" else:\n", | |
" url = f'https://en.wikipedia.org/wiki/AD_{abs(year)}'\n", | |
"\n", | |
" r = requests.get(url)\n", | |
" if logger is not None:\n", | |
" logger.info(f\"{year}: {r.status_code}\")\n", | |
" if r.status_code != 200:\n", | |
" return {section: [] for section in sections}\n", | |
" else:\n", | |
" soup = BeautifulSoup(r.text, 'lxml')\n", | |
" return {section: get_section(soup, section)\n", | |
" for section in sections}" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 108, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"years = {}" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 109, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "1ebf782c439b4cccbc2a5b54302237ce", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/html": [ | |
"<p>Failed to display Jupyter Widget of type <code>HBox</code>.</p>\n", | |
"<p>\n", | |
" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n", | |
" that the widgets JavaScript is still loading. If this message persists, it\n", | |
" likely means that the widgets JavaScript library is either not installed or\n", | |
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n", | |
" Widgets Documentation</a> for setup instructions.\n", | |
"</p>\n", | |
"<p>\n", | |
" If you're reading this message in another frontend (for example, a static\n", | |
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n", | |
" it may mean that your frontend doesn't currently support widgets.\n", | |
"</p>\n" | |
], | |
"text/plain": [ | |
"HBox(children=(IntProgress(value=0, max=2018), HTML(value='')))" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"for year in tqdm_notebook(range(1, 2019)):\n", | |
" years[year] = get_all_sections(year)\n", | |
" time.sleep(3)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 112, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"year_numbers = {\n", | |
" year: {\n", | |
" section: len(items)\n", | |
" for section, items in sections.items()\n", | |
" }\n", | |
" for year, sections in years.items()\n", | |
"}" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 118, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"years_df = pd.DataFrame.from_dict(year_numbers, orient='index')\n", | |
"years_df.index.name = 'year'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 119, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"years_df.to_csv('year_metrics.csv')" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python [conda env:Python3]", | |
"language": "python", | |
"name": "conda-env-Python3-py" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.4" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment