Created
August 9, 2020 06:23
-
-
Save hayashikun/951c6fba89bde1e984863d62fa22012a to your computer and use it in GitHub Desktop.
Universities in Tokyo
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import hashlib\n", | |
"import os\n", | |
"import re\n", | |
"import time\n", | |
"import urllib\n", | |
"\n", | |
"from bs4 import BeautifulSoup\n", | |
"from joblib import Parallel, delayed\n", | |
"from tqdm import tqdm" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def request(url, wait_request=0):\n", | |
" h = hashlib.sha1(url.encode()).hexdigest()\n", | |
" f = f\"cache/{h}\"\n", | |
" if os.path.exists(f):\n", | |
" with open(f) as fp:\n", | |
" return fp.read()\n", | |
"\n", | |
" if not os.path.exists(\"cache\"):\n", | |
" os.mkdir(\"cache\")\n", | |
" time.sleep(wait_request)\n", | |
" with urllib.request.urlopen(url) as response:\n", | |
" text = response.read().decode()\n", | |
" with open(f, \"w\") as fp:\n", | |
" fp.write(text)\n", | |
" return text\n", | |
"\n", | |
"\n", | |
"# request(\"https://ja.wikipedia.org\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Scheme and host of Japanese Wikipedia; later cells prepend it to article paths.\n", | |
"base = \"https://ja.wikipedia.org\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Manual add: ['日本大学']\n" | |
] | |
} | |
], | |
"source": [ | |
"universities = dict()\n", | |
"html = request(\n", | |
" f\"{base}/wiki/%E6%9D%B1%E6%97%A5%E6%9C%AC%E3%81%AE%E5%A4%A7%E5%AD%A6%E4%B8%80%E8%A6%A7\"\n", | |
")\n", | |
"soup = BeautifulSoup(html, \"html.parser\")\n", | |
"\n", | |
"invalid = dict()\n", | |
"\n", | |
"h3 = soup.find(id=\"東京都\").find_parent()\n", | |
"div = h3.find_next_sibling()\n", | |
"for ul in div.find_all(\"ul\"):\n", | |
" for li in ul.find_all(\"li\"):\n", | |
" a = li.find(\"a\").attrs\n", | |
" name = a[\"title\"]\n", | |
" if name.endswith(\"キャンパス\") or name.endswith(\"学部\") or name.endswith(\"大学院\"):\n", | |
" invalid[name] = a[\"href\"]\n", | |
" else:\n", | |
" universities[name] = a[\"href\"]\n", | |
"\n", | |
"reg = re.compile(\".+?大学\")\n", | |
"print(\n", | |
" \"Manual add:\",\n", | |
" [\n", | |
" name\n", | |
" for name in set([reg.search(name).group() for name in invalid.keys()])\n", | |
" if name not in universities\n", | |
" ],\n", | |
")\n", | |
"\n", | |
"universities[\"日本大学\"] = \"/wiki/%E6%97%A5%E6%9C%AC%E5%A4%A7%E5%AD%A6\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"100%|██████████| 182/182 [00:00<00:00, 1615.12it/s]\n" | |
] | |
} | |
], | |
"source": [ | |
"for url in tqdm(universities.values()):\n", | |
" request(f\"{base}/{url}\", wait_request=1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Manual add: ['多摩美術大学']\n" | |
] | |
} | |
], | |
"source": [ | |
"locations = dict()\n", | |
"\n", | |
"reg = re.compile(r\"東京都(.+?[区市])\")\n", | |
"\n", | |
"\n", | |
"def search_location(url):\n", | |
" html = request(f\"{base}/{url}\")\n", | |
" soup = BeautifulSoup(html, \"html.parser\")\n", | |
" info = soup.find(class_=\"infobox\")\n", | |
" td = info.find(\"th\", text=\"本部所在地\").find_next_sibling()\n", | |
" m = reg.search(td.text)\n", | |
" if m is not None:\n", | |
" return m.groups()[0]\n", | |
"\n", | |
"\n", | |
"def task(name, url):\n", | |
" try:\n", | |
" return name, search_location(url)\n", | |
" except Exception as e:\n", | |
" print(name, e)\n", | |
" return name, \"\"\n", | |
"\n", | |
"\n", | |
"processed = Parallel(n_jobs=-1)([delayed(task)(*p) for p in universities.items()])\n", | |
"locations = dict(processed)\n", | |
"print(\n", | |
" \"Manual add:\", [k for k, v in locations.items() if v == \"\"],\n", | |
")\n", | |
"\n", | |
"locations[\"多摩美術大学\"] = \"世田谷区\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"wards = {k: list() for k in set(locations.values()) if k is not None}\n", | |
"for k, v in locations.items():\n", | |
" if v is not None:\n", | |
" wards[v].append(k)\n", | |
"\n", | |
"counts = {k: len(v) for k, v in wards.items()}\n", | |
"counts = dict(sorted(counts.items(), key=lambda x: x[1], reverse=True))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Total\t137\n", | |
"----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", | |
"千代田区\t14\t大妻女子大学,大原大学院大学,共立女子大学,グロービス経営大学院大学,上智大学,専修大学,デジタルハリウッド大学,二松學舍大学,日本教育大学院大学,日本歯科大学,ビジネス・ブレークスルー大学,法政大学,明治大学,日本大学\n", | |
"文京区\t12\tお茶の水女子大学,東京大学,東京医科歯科大学,跡見学園女子大学,国際仏教学大学院大学,順天堂大学,拓殖大学,東洋大学,東洋学園大学,日本医科大学,日本女子大学,文京学院大学\n", | |
"世田谷区\t10\t国士舘大学,駒澤大学,産業能率大学,昭和女子大学,成城大学,多摩美術大学,東京都市大学,東京農業大学,日本女子体育大学,日本体育大学\n", | |
"港区\t9\t政策研究大学院大学,東京海洋大学,北里大学,慶應義塾大学,事業構想大学院大学,芝浦工業大学,東京慈恵会医科大学,ハリウッド大学院大学,明治学院大学\n", | |
"八王子市\t9\t東京都立大学 (2020-),創価大学,中央大学,東京工科大学,東京純心大学,東京造形大学,東京薬科大学,日本文化大学,ヤマザキ学園大学\n", | |
"新宿区\t8\t学習院女子大学,工学院大学,東京医科大学,東京女子医科大学,東京富士大学,東京理科大学,目白大学,早稲田大学\n", | |
"渋谷区\t7\t青山学院大学,國學院大學,聖心女子大学,東海大学,日本赤十字看護大学,文化学園大学,文化ファッション大学院大学\n", | |
"品川区\t7\t産業技術大学院大学,昭和大学,杉野服飾大学,清泉女子大学,東京医療保健大学,星薬科大学,立正大学\n", | |
"町田市\t5\t桜美林大学,昭和薬科大学,玉川大学,東京家政学院大学,和光大学\n", | |
"三鷹市\t4\t杏林大学,国際基督教大学,東京神学大学,ルーテル学院大学\n", | |
"小平市\t4\t嘉悦大学,白梅学園大学,津田塾大学,武蔵野美術大学\n", | |
"豊島区\t4\t学習院大学,大正大学,帝京平成大学,立教大学\n", | |
"多摩市\t3\t恵泉女学園大学,多摩大学,東京医療学院大学\n", | |
"板橋区\t3\t大東文化大学,帝京大学,東京家政大学\n", | |
"調布市\t3\t電気通信大学,白百合女子大学,桐朋学園大学\n", | |
"武蔵野市\t3\t亜細亜大学,成蹊大学,日本獣医生命科学大学\n", | |
"足立区\t3\t帝京科学大学,東京電機大学,東京未来大学\n", | |
"杉並区\t3\t女子美術大学,高千穂大学,東京女子大学\n", | |
"江東区\t2\t東京有明医療大学,武蔵野大学\n", | |
"練馬区\t2\t武蔵大学,武蔵野音楽大学\n", | |
"台東区\t2\t東京芸術大学,上野学園大学\n", | |
"目黒区\t2\t東京工業大学,東京音楽大学\n", | |
"府中市\t2\t東京外国語大学,東京農工大学\n", | |
"中野区\t2\tこども教育宝仙大学,東京工芸大学\n", | |
"国立市\t2\t一橋大学,東京女子体育大学\n", | |
"日野市\t2\t実践女子大学,明星大学\n", | |
"清瀬市\t2\t日本社会事業大学,明治薬科大学\n", | |
"葛飾区\t1\t東京聖栄大学\n", | |
"立川市\t1\t国立音楽大学\n", | |
"小金井市\t1\t東京学芸大学\n", | |
"大田区\t1\t東邦大学\n", | |
"稲城市\t1\t駒沢女子大学\n", | |
"中央区\t1\t聖路加国際大学\n", | |
"国分寺市\t1\t東京経済大学\n", | |
"北区\t1\t東京成徳大学\n" | |
] | |
} | |
], | |
"source": [ | |
"print(f\"Total\\t{sum(counts.values())}\")\n", | |
"print(\"-\" * 190)\n", | |
"\n", | |
"for name in counts.keys():\n", | |
" print(f\"{name}\\t{counts[name]}\\t{','.join(wards[name])}\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.8.0" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment