Created
December 30, 2018 05:48
-
-
Save hokuma/3785b1ccaf3bd32ecb983729514a2f6d to your computer and use it in GitHub Desktop.
柏議会の投票結果のテーブルhtmlから結果を抽出する
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "AllVotes.ipynb", | |
"version": "0.3.2", | |
"provenance": [], | |
"collapsed_sections": [] | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
} | |
}, | |
"cells": [ | |
{ | |
"metadata": { | |
"id": "PcAGsAxUsUrf", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"!pip install pyquery" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "bzn8tDoMvl8m", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"from google.colab import drive\n", | |
"drive.mount('/content/gdrive')" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "7mn9u5G9ncyQ", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"from pyquery import PyQuery as pq\n", | |
"import re\n", | |
"\n", | |
"class ParseError(Exception):\n", | |
" def __init__(self, id, html):\n", | |
" self.id = id\n", | |
" self.html = html\n", | |
"\n", | |
"# 投票セルの内容から結果を決定する\n", | |
"def vote_decider(td):\n", | |
" element = pq(td)\n", | |
" vote = 0\n", | |
" # 賛成、反対の両方の画像を含むときは、会派内で意見が割れているため多数決で決める。優劣つかない場合は賛成優先。\n", | |
" # 除外、棄権などの数は含めない。\n", | |
" if element.find('img[alt=\"賛成\"]') and (element.find('img[alt=\"反対\"]') or element.find('img[alt=\"×-b\"]')):\n", | |
" if len(element.find('p')) != 2:\n", | |
" return None\n", | |
" yea = 0\n", | |
" nay = 0\n", | |
" for p in element.find('p'):\n", | |
" try:\n", | |
" if pq(p).find('img[alt=\"賛成\"]'):\n", | |
" yea = int(pq(p).text())\n", | |
" elif pq(p).find('img[alt=\"反対\"]') or pq(p).find('img[alt=\"×-b\"]'):\n", | |
" nay = int(pq(p).text())\n", | |
" except ValueError:\n", | |
" raise ParseError(id, row.html())\n", | |
" vote = 1 if yea >= nay else 0\n", | |
" else:\n", | |
" # 賛成、反対の双方を含まないときは、以下のどれかのパターンに該当する\n", | |
" if element.find('img').attr('alt') == '賛成':\n", | |
" vote = 1\n", | |
" elif element.find('img[alt=\"反対\"]') or element.find('img[alt=\"×-b\"]') or element.text() == '棄':\n", | |
" vote = 0\n", | |
" elif element.text() == '欠' or element.text() == '除':\n", | |
" vote = -1\n", | |
" else:\n", | |
" vote = None\n", | |
" \n", | |
" return vote\n", | |
" \n", | |
"def parse_gian(identifier, table):\n", | |
" d = pq(table)\n", | |
" gian_kind = ''\n", | |
" votes = []\n", | |
" \n", | |
" trs = d('tr:nth-child(n+2)')\n", | |
" for tr in trs:\n", | |
" p_tr = pq(tr)\n", | |
" row = []\n", | |
" if len(p_tr('td')) == 0 and len(p_tr('th')) == 0:\n", | |
" continue\n", | |
" \n", | |
" # td1つにまとめられている場合は議案の種別行\n", | |
" if len(p_tr.find('td')) == 1:\n", | |
" if re.match('市長*', p_tr.text()): \n", | |
" gian_kind = 'P'\n", | |
" elif re.match('委員会*', p_tr.text()):\n", | |
" gian_kind = 'C'\n", | |
" elif re.match('継続*', p_tr.text()):\n", | |
" gian_kind = 'K'\n", | |
" else:\n", | |
" raise ParseError('', p_tr.html())\n", | |
" else:\n", | |
" id_td = p_tr('td:first')\n", | |
" id = ''\n", | |
" if gian_kind != '':\n", | |
" id = identifier + '_' + gian_kind + '_' + id_td.text()\n", | |
" else:\n", | |
" id = identifier + '_' + id_td.text()\n", | |
" \n", | |
" # 改行入ってるケースがあり邪魔なので消す\n", | |
" result = ''.join(p_tr('td').eq(2).text().split())\n", | |
" if result == '継続審査':\n", | |
" continue\n", | |
" \n", | |
" # 想定外のものがあったら例外とする\n", | |
" if result not in ['可決', '採択', '異議なし', '認可', '承認', '同意', '不採択', '認定', '否決', '可決・認定', '可決及び認定']:\n", | |
" raise ParseError(id, row.html())\n", | |
" \n", | |
" passed = result in ['可決', '採択', '異議なし', '認可', '承認', '同意', '認定', '可決・認定', '可決及び認定']\n", | |
" row.append(id)\n", | |
" row.append(passed)\n", | |
" for td in p_tr('td:nth-child(n+4)'):\n", | |
" vote = vote_decider(td)\n", | |
" if vote == None:\n", | |
" raise ParseError(id, p_tr.html())\n", | |
" row.append(vote)\n", | |
" votes.append(row)\n", | |
" return votes\n", | |
"\n", | |
"def parse_seigan(identifier, table):\n", | |
" d = pq(table)\n", | |
" gian_kind = 'N'\n", | |
" votes = []\n", | |
" seigan_id = 0\n", | |
" sentence_number = 0\n", | |
" offset = 0\n", | |
" with_sentence = True\n", | |
" have_sentence = False\n", | |
" skip = True\n", | |
" trs = d('tr')\n", | |
" \n", | |
" for tr in trs:\n", | |
" p_tr = pq(tr)\n", | |
" if len(p_tr('td')) == 0 and len(p_tr('th')) == 0:\n", | |
" continue\n", | |
" \n", | |
" # 先頭行にtd1つで請願種別が入っているケースがあり無視したい\n", | |
" if ''.join(p_tr('th:first').text().split()) == '請願番号':\n", | |
" if p_tr('th:nth-child(3)').text() != '主旨':\n", | |
" with_sentence = False\n", | |
" skip = False\n", | |
" continue\n", | |
" \n", | |
" if skip:\n", | |
" continue\n", | |
" \n", | |
" row = []\n", | |
" if len(p_tr.find('td')) == 1:\n", | |
" if re.match('継続*', p_tr.text()):\n", | |
" gian_kind = 'K'\n", | |
" continue\n", | |
" else:\n", | |
" id_td = p_tr('td:first')\n", | |
" # rowspanの数だけ主旨がある\n", | |
" if id_td.attr('rowspan'):\n", | |
" seigan_id = id_td.text()\n", | |
" sentence_count = int(id_td.attr('rowspan'))\n", | |
" sentence_number = 1\n", | |
" offset = 0\n", | |
" have_sentence = True\n", | |
" else:\n", | |
" # 同一請願で異なる主旨\n", | |
" if have_sentence and sentence_number < sentence_count:\n", | |
" sentence_number = sentence_number + 1\n", | |
" offset = 2 # 番号とタイトル分だけtdセルが減る\n", | |
" else:\n", | |
" # 主旨が1つの請願\n", | |
" # colspanでタイトルと主旨のセルがまとめられているケースがあるのでoffsetを計算\n", | |
" have_sentence = False\n", | |
" seigan_id = id_td.text()\n", | |
" sentence_number = 1\n", | |
" offset = 1 if p_tr.find('td').eq(1).attr('colspan') else 0\n", | |
" \n", | |
" # 主旨を持たないテーブルのときはoffsetは固定で1とする\n", | |
" # 常に、colspanでまとめられている状態として扱う\n", | |
" if not with_sentence:\n", | |
" offset = 1\n", | |
" \n", | |
" id = identifier + '_' + gian_kind + '_' + seigan_id + '_' + str(sentence_number)\n", | |
" if p_tr('td').eq(2-offset).find('p:nth-child(2)').text() == '継続審査':\n", | |
" continue\n", | |
" \n", | |
" result = ''.join(p_tr('td').eq(3-offset).text().split())\n", | |
" if result == '継続審査' or result == '審議未了':\n", | |
" continue\n", | |
" \n", | |
" if result not in ['採択', '不採択', '不採択(議長裁決)']:\n", | |
" raise ParseError(id, p_tr.html())\n", | |
" \n", | |
" passed = result == '採択'\n", | |
" row.append(id)\n", | |
" row.append(passed)\n", | |
" if p_tr('td:nth-child(n+' + str(5-offset) + ')').text() == '-':\n", | |
" continue\n", | |
" \n", | |
" for td in p_tr('td:nth-child(n+' + str(5-offset) + ')'):\n", | |
" vote = vote_decider(td)\n", | |
" if vote == None:\n", | |
" raise ParseError(id, p_tr.html())\n", | |
" row.append(vote)\n", | |
" votes.append(row)\n", | |
" return votes" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "EvlonBIJv1hQ", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"import glob\n", | |
"\n", | |
"filepaths = glob.glob('/content/gdrive/PATH/TO/YOUR/TABLE_HTML/H*.txt')\n", | |
"file_reg = re.compile('^.*H(\\d+)_N(\\d+)_(.+)\\.txt$')\n", | |
"allVotes = []\n", | |
"for filepath in filepaths:\n", | |
" print(filepath)\n", | |
" result = re.match(file_reg, filepath)\n", | |
" identifier = result.group(1).zfill(2) + '_' + result.group(2).zfill(2) + '_' + result.group(3)\n", | |
" f = open(filepath)\n", | |
" table = f.read()\n", | |
" if result.group(3) == 'S':\n", | |
" votes = parse_seigan(identifier, table)\n", | |
" else:\n", | |
" votes = parse_gian(identifier, table)\n", | |
" for vote in votes:\n", | |
" allVotes.append(vote)\n", | |
" " | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "B3zMx47wx-Nm", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"import csv\n", | |
"\n", | |
"with open('/content/gdrive/PATH/TO/YOUR/ANALYZED/RESULT/votes.csv', 'w') as out:\n", | |
" writer = csv.writer(out)\n", | |
" for line in allVotes:\n", | |
" writer.writerow(line)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment