Last active
March 27, 2024 06:18
-
-
Save justmarkham/ddcfec467c67ce4fa4e823fc2df20e79 to your computer and use it in GitHub Desktop.
Data School blog post
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"id": "c8b3ccce", | |
"metadata": {}, | |
"source": [ | |
"# [Building a dataset of Python versions with regular expressions](https://www.dataschool.io/web-scraping-with-regex/)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "327aa069", | |
"metadata": {}, | |
"source": [ | |
"[Python Documentation by Version](https://www.python.org/doc/versions/)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "068029f6", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import requests" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "07c1652b", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Download the page that lists every documented Python release.\n", | |
"# The timeout keeps the cell from hanging forever on a stalled connection, and\n", | |
"# raise_for_status() stops here on an HTTP error instead of parsing an error page.\n", | |
"r = requests.get('https://www.python.org/doc/versions/', timeout=30)\n", | |
"r.raise_for_status()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "4872696c", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"<h1>Python Documentation by Version</h1>\n", | |
"<p>Some previous versions of the documentation remain available\n", | |
"online. Use the list below to select a version to view.</p>\n", | |
"<p>For unreleased (in development) documentation, see\n", | |
"<a class=\"reference internal\" href=\"#in-development-versions\">In Development Versions</a>.</p>\n", | |
"<ul class=\"simple\">\n", | |
"<li><a class=\"reference external\" href=\"https://docs.python.org/release/3.11.2/\">Python 3.11.2</a>, documentation released on 8 February 2023.</li>\n", | |
"<li><a class=\"reference external\" href=\"https://docs.python.org/release/3.11.1/\">Python 3.11.1</a>, documentation released on 6 December 2022.</li>\n", | |
"<li><a class=\"reference external\" href=\"https://docs.python.org/release/3.11.0/\">Python 3.11.0</a>, documentation released on 24 October 2022.</li>\n" | |
] | |
} | |
], | |
"source": [ | |
"# Preview the start of the version list. The section is located by its heading\n", | |
"# rather than by fixed character offsets, which drift whenever the page changes.\n", | |
"heading = '<h1>Python Documentation by Version</h1>'\n", | |
"start = max(r.text.find(heading), 0)  # fall back to the top of the page if the heading is missing\n", | |
"print(r.text[start:start + 778])  # 778 characters: the intro plus the first few list items" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "aaddac57", | |
"metadata": {}, | |
"source": [ | |
"## Extracting the dates" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "bf247f46", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import re" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "a8d1b84f", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['8 February 2023', '6 December 2022', '24 October 2022']" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# A release date is written as day digits, a month word and a four-digit year,\n", | |
"# for example '8 February 2023'. Collect every such match from the page text.\n", | |
"dates = re.compile(r'\\d+ \\w+ \\d{4}').findall(r.text)\n", | |
"dates[:3]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "1841112d", | |
"metadata": {}, | |
"source": [ | |
"## Extracting the version numbers" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "324b89f3", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['3.11.2', '3.11.1', '3.11.0']" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Capture the text that follows 'Python ' up to the next '<', provided it starts\n", | |
"# with a digit, for example '3.11.2'. The lazy quantifier stops at the first '<'.\n", | |
"versions = re.compile(r'Python (\\d.+?)<').findall(r.text)\n", | |
"versions[:3]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "4a25c479", | |
"metadata": {}, | |
"source": [ | |
"## Creating the dataset" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"id": "fe1231f4", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"id": "6e291c35", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Version</th>\n", | |
" <th>Date</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>3.11.2</td>\n", | |
" <td>8 February 2023</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>3.11.1</td>\n", | |
" <td>6 December 2022</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>3.11.0</td>\n", | |
" <td>24 October 2022</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>3.10.10</td>\n", | |
" <td>8 February 2023</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>3.10.9</td>\n", | |
" <td>6 December 2022</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>184</th>\n", | |
" <td>1.5.2</td>\n", | |
" <td>30 April 1999</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>185</th>\n", | |
" <td>1.5.1p1</td>\n", | |
" <td>6 August 1998</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>186</th>\n", | |
" <td>1.5.1</td>\n", | |
" <td>14 April 1998</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>187</th>\n", | |
" <td>1.5</td>\n", | |
" <td>17 February 1998</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>188</th>\n", | |
" <td>1.4</td>\n", | |
" <td>25 October 1996</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>189 rows × 2 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" Version Date\n", | |
"0 3.11.2 8 February 2023\n", | |
"1 3.11.1 6 December 2022\n", | |
"2 3.11.0 24 October 2022\n", | |
"3 3.10.10 8 February 2023\n", | |
"4 3.10.9 6 December 2022\n", | |
".. ... ...\n", | |
"184 1.5.2 30 April 1999\n", | |
"185 1.5.1p1 6 August 1998\n", | |
"186 1.5.1 14 April 1998\n", | |
"187 1.5 17 February 1998\n", | |
"188 1.4 25 October 1996\n", | |
"\n", | |
"[189 rows x 2 columns]" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# versions and dates come from two independent regex scans, and zip() silently\n", | |
"# truncates to the shorter list, so fail loudly if the two ever disagree in length.\n", | |
"assert len(versions) == len(dates), 'versions and dates are out of sync'\n", | |
"# Pair each version with its release date, one row per release.\n", | |
"pd.DataFrame(zip(versions, dates), columns=['Version', 'Date'])" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.10.9" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment