Skip to content

Instantly share code, notes, and snippets.

@hayashikun
Created August 9, 2020 06:23
Show Gist options
  • Save hayashikun/951c6fba89bde1e984863d62fa22012a to your computer and use it in GitHub Desktop.
Save hayashikun/951c6fba89bde1e984863d62fa22012a to your computer and use it in GitHub Desktop.
Universities in Tokyo
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import hashlib\n",
"import os\n",
"import re\n",
"import time\n",
"import urllib\n",
"import urllib.request\n",
"\n",
"from bs4 import BeautifulSoup\n",
"from joblib import Parallel, delayed\n",
"from tqdm import tqdm"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def request(url, wait_request=0):\n",
"    \"\"\"Return the body of *url* as text, using an on-disk cache.\n",
"\n",
"    The cache file lives in the ``cache`` directory and is named after the\n",
"    SHA-1 hex digest of the URL. On a cache hit the stored text is returned\n",
"    and nothing is downloaded. On a miss the function sleeps for\n",
"    ``wait_request`` seconds, downloads the page, stores it and returns it.\n",
"\n",
"    Args:\n",
"        url: Address to fetch; it is also the cache key.\n",
"        wait_request: Seconds to sleep before an actual download.\n",
"\n",
"    Returns:\n",
"        The response body decoded as UTF-8.\n",
"    \"\"\"\n",
"    h = hashlib.sha1(url.encode()).hexdigest()\n",
"    f = f\"cache/{h}\"\n",
"    # EAFP: try the cache first; a missing file simply means a cache miss.\n",
"    try:\n",
"        # Explicit encoding so the cache does not depend on the locale default.\n",
"        with open(f, encoding=\"utf-8\") as fp:\n",
"            return fp.read()\n",
"    except FileNotFoundError:\n",
"        pass\n",
"\n",
"    # exist_ok avoids a check-then-create race when several workers start at once.\n",
"    os.makedirs(\"cache\", exist_ok=True)\n",
"    time.sleep(wait_request)\n",
"    with urllib.request.urlopen(url) as response:\n",
"        text = response.read().decode(\"utf-8\")\n",
"    with open(f, \"w\", encoding=\"utf-8\") as fp:\n",
"        fp.write(text)\n",
"    return text\n",
"\n",
"\n",
"# request(\"https://ja.wikipedia.org\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Root of the Japanese Wikipedia site; page paths are appended to it.\n",
"base = \"https://ja.wikipedia.org\""
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Manual add: ['日本大学']\n"
]
}
],
"source": [
"# Maps university name -> Wikipedia page path, read from the Tokyo part of the list page.\n",
"universities = {}\n",
"html = request(\n",
"    f\"{base}/wiki/%E6%9D%B1%E6%97%A5%E6%9C%AC%E3%81%AE%E5%A4%A7%E5%AD%A6%E4%B8%80%E8%A6%A7\"\n",
")\n",
"soup = BeautifulSoup(html, \"html.parser\")\n",
"\n",
"# Entries whose title ends with one of these suffixes are set aside in\n",
"# `invalid` instead of being stored as universities.\n",
"invalid = {}\n",
"set_aside_suffixes = (\"キャンパス\", \"学部\", \"大学院\")\n",
"\n",
"# The list is read from the element that follows the parent of id=\"東京都\".\n",
"h3 = soup.find(id=\"東京都\").find_parent()\n",
"div = h3.find_next_sibling()\n",
"for ul in div.find_all(\"ul\"):\n",
"    for li in ul.find_all(\"li\"):\n",
"        link = li.find(\"a\")\n",
"        # Skip list items that carry no titled link instead of crashing on them.\n",
"        if link is None or not link.get(\"title\") or not link.get(\"href\"):\n",
"            continue\n",
"        name, href = link[\"title\"], link[\"href\"]\n",
"        if name.endswith(set_aside_suffixes):\n",
"            invalid[name] = href\n",
"        else:\n",
"            universities[name] = href\n",
"\n",
"# Shortest prefix ending in \"大学\": the university a set-aside entry belongs to.\n",
"reg = re.compile(r\".+?大学\")\n",
"parents = set()\n",
"for name in invalid:\n",
"    m = reg.search(name)\n",
"    # A set-aside title without \"大学\" has no parent to report; ignore it.\n",
"    if m is not None:\n",
"        parents.add(m.group())\n",
"\n",
"# Parents that were not picked up directly have to be added by hand below.\n",
"print(\n",
"    \"Manual add:\",\n",
"    sorted(p for p in parents if p not in universities),\n",
")\n",
"\n",
"universities[\"日本大学\"] = \"/wiki/%E6%97%A5%E6%9C%AC%E5%A4%A7%E5%AD%A6\""
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 182/182 [00:00<00:00, 1615.12it/s]\n"
]
}
],
"source": [
"# Fetch every university page once, sleeping 1 second before each real download.\n",
"# NOTE: the stored paths start with \"/\", so the URL contains \"//\"; it is kept\n",
"# unchanged because the URL is the cache key.\n",
"for href in tqdm(universities.values()):\n",
"    page_url = f\"{base}/{href}\"\n",
"    request(page_url, wait_request=1)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Manual add: ['多摩美術大学']\n"
]
}
],
"source": [
"locations = {}\n",
"\n",
"# Captures the shortest text after \"東京都\" that ends in \"区\" or \"市\".\n",
"# NOTE: this rebinds the module-level name `reg` that an earlier cell also uses.\n",
"reg = re.compile(r\"東京都(.+?[区市])\")\n",
"\n",
"\n",
"def search_location(url):\n",
"    \"\"\"Return the Tokyo ward/city found in a page's \"本部所在地\" infobox row.\n",
"\n",
"    Args:\n",
"        url: Page path appended to ``base``.\n",
"\n",
"    Returns:\n",
"        The text captured by ``reg``, or ``None`` when the cell text does not\n",
"        match the pattern.\n",
"\n",
"    Raises:\n",
"        LookupError: If the infobox, the \"本部所在地\" header cell or the cell\n",
"            next to it cannot be found.\n",
"    \"\"\"\n",
"    html = request(f\"{base}/{url}\")\n",
"    soup = BeautifulSoup(html, \"html.parser\")\n",
"    info = soup.find(class_=\"infobox\")\n",
"    if info is None:\n",
"        raise LookupError(\"infobox not found\")\n",
"    # `string=` is the current name of the deprecated `text=` argument.\n",
"    th = info.find(\"th\", string=\"本部所在地\")\n",
"    if th is None:\n",
"        raise LookupError(\"本部所在地 row not found in infobox\")\n",
"    td = th.find_next_sibling()\n",
"    if td is None:\n",
"        raise LookupError(\"本部所在地 row has no value cell\")\n",
"    m = reg.search(td.text)\n",
"    if m is None:\n",
"        return None\n",
"    return m.group(1)\n",
"\n",
"\n",
"def task(name, url):\n",
"    \"\"\"Return ``(name, location)`` for one university.\n",
"\n",
"    Any exception raised by ``search_location`` is printed together with the\n",
"    name, and the location is reported as an empty string instead.\n",
"    \"\"\"\n",
"    try:\n",
"        location = search_location(url)\n",
"    except Exception as e:\n",
"        # Best effort: report the failure and let the caller fix it by hand.\n",
"        print(name, e)\n",
"        return name, \"\"\n",
"    return name, location\n",
"\n",
"\n",
"# Look up every university in parallel, using all available cores.\n",
"jobs = [delayed(task)(uni_name, uni_url) for uni_name, uni_url in universities.items()]\n",
"processed = Parallel(n_jobs=-1)(jobs)\n",
"locations = dict(processed)\n",
"\n",
"# An empty string marks a failed lookup that has to be filled in by hand.\n",
"unresolved = [uni_name for uni_name, ward in locations.items() if ward == \"\"]\n",
"print(\"Manual add:\", unresolved)\n",
"\n",
"locations[\"多摩美術大学\"] = \"世田谷区\""
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Group university names by ward/city. Entries whose location is None (no\n",
"# match for the Tokyo pattern) or \"\" (lookup failed and was not fixed by\n",
"# hand) are not wards, so both are skipped.\n",
"wards = {}\n",
"for k, v in locations.items():\n",
"    if v:\n",
"        wards.setdefault(v, []).append(k)\n",
"\n",
"# Number of universities per ward, largest first; ties keep first-seen order.\n",
"counts = {k: len(v) for k, v in wards.items()}\n",
"counts = dict(sorted(counts.items(), key=lambda x: x[1], reverse=True))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total\t137\n",
"----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n",
"千代田区\t14\t大妻女子大学,大原大学院大学,共立女子大学,グロービス経営大学院大学,上智大学,専修大学,デジタルハリウッド大学,二松學舍大学,日本教育大学院大学,日本歯科大学,ビジネス・ブレークスルー大学,法政大学,明治大学,日本大学\n",
"文京区\t12\tお茶の水女子大学,東京大学,東京医科歯科大学,跡見学園女子大学,国際仏教学大学院大学,順天堂大学,拓殖大学,東洋大学,東洋学園大学,日本医科大学,日本女子大学,文京学院大学\n",
"世田谷区\t10\t国士舘大学,駒澤大学,産業能率大学,昭和女子大学,成城大学,多摩美術大学,東京都市大学,東京農業大学,日本女子体育大学,日本体育大学\n",
"港区\t9\t政策研究大学院大学,東京海洋大学,北里大学,慶應義塾大学,事業構想大学院大学,芝浦工業大学,東京慈恵会医科大学,ハリウッド大学院大学,明治学院大学\n",
"八王子市\t9\t東京都立大学 (2020-),創価大学,中央大学,東京工科大学,東京純心大学,東京造形大学,東京薬科大学,日本文化大学,ヤマザキ学園大学\n",
"新宿区\t8\t学習院女子大学,工学院大学,東京医科大学,東京女子医科大学,東京富士大学,東京理科大学,目白大学,早稲田大学\n",
"渋谷区\t7\t青山学院大学,國學院大學,聖心女子大学,東海大学,日本赤十字看護大学,文化学園大学,文化ファッション大学院大学\n",
"品川区\t7\t産業技術大学院大学,昭和大学,杉野服飾大学,清泉女子大学,東京医療保健大学,星薬科大学,立正大学\n",
"町田市\t5\t桜美林大学,昭和薬科大学,玉川大学,東京家政学院大学,和光大学\n",
"三鷹市\t4\t杏林大学,国際基督教大学,東京神学大学,ルーテル学院大学\n",
"小平市\t4\t嘉悦大学,白梅学園大学,津田塾大学,武蔵野美術大学\n",
"豊島区\t4\t学習院大学,大正大学,帝京平成大学,立教大学\n",
"多摩市\t3\t恵泉女学園大学,多摩大学,東京医療学院大学\n",
"板橋区\t3\t大東文化大学,帝京大学,東京家政大学\n",
"調布市\t3\t電気通信大学,白百合女子大学,桐朋学園大学\n",
"武蔵野市\t3\t亜細亜大学,成蹊大学,日本獣医生命科学大学\n",
"足立区\t3\t帝京科学大学,東京電機大学,東京未来大学\n",
"杉並区\t3\t女子美術大学,高千穂大学,東京女子大学\n",
"江東区\t2\t東京有明医療大学,武蔵野大学\n",
"練馬区\t2\t武蔵大学,武蔵野音楽大学\n",
"台東区\t2\t東京芸術大学,上野学園大学\n",
"目黒区\t2\t東京工業大学,東京音楽大学\n",
"府中市\t2\t東京外国語大学,東京農工大学\n",
"中野区\t2\tこども教育宝仙大学,東京工芸大学\n",
"国立市\t2\t一橋大学,東京女子体育大学\n",
"日野市\t2\t実践女子大学,明星大学\n",
"清瀬市\t2\t日本社会事業大学,明治薬科大学\n",
"葛飾区\t1\t東京聖栄大学\n",
"立川市\t1\t国立音楽大学\n",
"小金井市\t1\t東京学芸大学\n",
"大田区\t1\t東邦大学\n",
"稲城市\t1\t駒沢女子大学\n",
"中央区\t1\t聖路加国際大学\n",
"国分寺市\t1\t東京経済大学\n",
"北区\t1\t東京成徳大学\n"
]
}
],
"source": [
"# Tab-separated report: a total line, a rule, then one row per ward.\n",
"total = sum(counts.values())\n",
"print(f\"Total\\t{total}\")\n",
"print(\"-\" * 190)\n",
"\n",
"for ward, count in counts.items():\n",
"    members = \",\".join(wards[ward])\n",
"    print(f\"{ward}\\t{count}\\t{members}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment