Skip to content

Instantly share code, notes, and snippets.

@vol1ura
Last active December 26, 2021 09:56
Show Gist options
  • Save vol1ura/722bbe3cd9dadfcaac9e778cd2055a70 to your computer and use it in GitHub Desktop.
Save vol1ura/722bbe3cd9dadfcaac9e778cd2055a70 to your computer and use it in GitHub Desktop.
clubmates.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "clubmates.ipynb",
"provenance": [],
"collapsed_sections": [],
"authorship_tag": "ABX9TyMSbfzztz2IMxYL6dDXQJeG",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/vol1ura/722bbe3cd9dadfcaac9e778cd2055a70/clubmates.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "zRg0J0r6ztxV"
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import random\n",
"import re\n",
"import requests\n",
"import time\n",
"from tqdm.notebook import tqdm\n",
"\n",
"pd.set_option('display.max_rows', None)"
]
},
{
"cell_type": "code",
"source": [
"!pip install random_user_agent"
],
"metadata": {
"id": "Z61g9j-HFebY",
"outputId": "3c9ecf80-d679-4e10-b42c-67e33d268940",
"colab": {
"base_uri": "https://localhost:8080/"
}
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Requirement already satisfied: random_user_agent in /usr/local/lib/python3.7/dist-packages (1.0.1)\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"from random_user_agent.user_agent import UserAgent\n",
"\n",
"user_agent_rotator = UserAgent()"
],
"metadata": {
"id": "bZnw6iEPFy36"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"club_id = 23212 # Wake&Run\n",
"home_parkrun = 'Kuzminki'"
],
"metadata": {
"id": "clo0v2xzzyLU"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"headers = {\n",
" 'Host': 'www.parkrun.ru',\n",
" 'User-Agent': user_agent_rotator.get_random_user_agent(),\n",
" 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',\n",
" 'Accept-Language': 'ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3',\n",
" 'Accept-Encoding': 'gzip, deflate, br',\n",
" 'Connection': 'keep-alive',\n",
" 'Upgrade-Insecure-Requests': '1',\n",
" 'Sec-GPC': '1',\n",
" 'TE': 'Trailers'\n",
" }"
],
"metadata": {
"id": "JMntqd19z4bw"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"page_all_results = requests.get('https://www.parkrun.ru/results/courserecords/', headers=headers)\n",
"data = pd.read_html(page_all_results.text)[0]\n",
"russian_parkruns = data[data.columns[0]]"
],
"metadata": {
"id": "ctIMuJuVz7T9",
"outputId": "29d2ffd0-250d-40e1-a1f9-a2edb2e98511",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 348
}
},
"execution_count": null,
"outputs": [
{
"output_type": "error",
"ename": "ValueError",
"evalue": "ignored",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-58-7fb0c7ed9957>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mpage_all_results\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrequests\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'https://www.parkrun.ru/results/courserecords/'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mheaders\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_html\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpage_all_results\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mrussian_parkruns\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/pandas/util/_decorators.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 294\u001b[0m )\n\u001b[1;32m 295\u001b[0m \u001b[0mwarnings\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwarn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mFutureWarning\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstacklevel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstacklevel\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 296\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 297\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 298\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/pandas/io/html.py\u001b[0m in \u001b[0;36mread_html\u001b[0;34m(io, match, flavor, header, index_col, skiprows, attrs, parse_dates, thousands, encoding, decimal, converters, na_values, keep_default_na, displayed_only)\u001b[0m\n\u001b[1;32m 1099\u001b[0m \u001b[0mna_values\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mna_values\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1100\u001b[0m \u001b[0mkeep_default_na\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mkeep_default_na\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1101\u001b[0;31m \u001b[0mdisplayed_only\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdisplayed_only\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1102\u001b[0m )\n",
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/pandas/io/html.py\u001b[0m in \u001b[0;36m_parse\u001b[0;34m(flavor, io, match, attrs, encoding, displayed_only, **kwargs)\u001b[0m\n\u001b[1;32m 915\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 916\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 917\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mretained\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 918\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 919\u001b[0m \u001b[0mret\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/pandas/io/html.py\u001b[0m in \u001b[0;36m_parse\u001b[0;34m(flavor, io, match, attrs, encoding, displayed_only, **kwargs)\u001b[0m\n\u001b[1;32m 896\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 897\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 898\u001b[0;31m \u001b[0mtables\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparse_tables\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 899\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mValueError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mcaught\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 900\u001b[0m \u001b[0;31m# if `io` is an io-like object, check if it's seekable\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/pandas/io/html.py\u001b[0m in \u001b[0;36mparse_tables\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 215\u001b[0m \u001b[0mlist\u001b[0m \u001b[0mof\u001b[0m \u001b[0mparsed\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mheader\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfooter\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0mtuples\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mtables\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 216\u001b[0m \"\"\"\n\u001b[0;32m--> 217\u001b[0;31m \u001b[0mtables\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_parse_tables\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_build_doc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmatch\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mattrs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 218\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_parse_thead_tbody_tfoot\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtable\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mtable\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtables\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 219\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/pandas/io/html.py\u001b[0m in \u001b[0;36m_parse_tables\u001b[0;34m(self, doc, match, attrs)\u001b[0m\n\u001b[1;32m 545\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 546\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mtables\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 547\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"No tables found\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 548\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 549\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mValueError\u001b[0m: No tables found"
]
}
]
},
{
"cell_type": "code",
"source": [
"dfs = []\n",
"for parkrun in tqdm(russian_parkruns):\n",
" time.sleep(3 + 5*random.random())\n",
" parkrun_trim = re.sub(r'[\\s-]', '', parkrun)\n",
" url = f'https://www.parkrun.ru/{parkrun_trim}/results/clubhistory/?clubNum={club_id}'\n",
" headers['User-Agent'] = user_agent_rotator.get_random_user_agent()\n",
" club_results = requests.get(url, headers=headers)\n",
" try:\n",
" df = pd.read_html(club_results.text)[0]\n",
" dfs.append(df[df.columns[0]])\n",
" if parkrun == home_parkrun:\n",
" home_list = df[df.columns[0]]\n",
" except:\n",
" print('ОШИБКА - операция завершилась досрочно. Паркран временно заблокировал IP.')\n",
" break"
],
"metadata": {
"id": "pB86PSu40JiI"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def last_name_first(full_name: str):\n",
" names = full_name.split()\n",
" last_name = names.pop(1).capitalize()\n",
" names.insert(0, last_name)\n",
" return ' '.join(names)"
],
"metadata": {
"id": "JGv4TY_5LOho"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"full_list = pd.concat(dfs).drop_duplicates(keep='last')\n",
"outer_home_list = pd.concat([full_list, home_list]).apply(last_name_first).sort_values().drop_duplicates(keep=False).reset_index(drop=True)"
],
"metadata": {
"id": "u-ij29bnAOaq"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### Одноклубники, не бегавшие на домашнем паркране:"
],
"metadata": {
"id": "QmpqdvUI8Hsx"
}
},
{
"cell_type": "code",
"source": [
"outer_home_list.shift(1, fill_value='________Фамилия_Имя___')"
],
"metadata": {
"id": "526myT8CFL74"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### Полный список одноклубников:"
],
"metadata": {
"id": "5Ka2TdEp8a4_"
}
},
{
"cell_type": "code",
"source": [
"full_list.apply(last_name_first).sort_values().reset_index(drop=True).shift(1, fill_value='________Фамилия_Имя___')"
],
"metadata": {
"id": "ImhNvi-aS6I_"
},
"execution_count": null,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment