Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
柏議会の投票結果のテーブルhtmlから結果を抽出する
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "AllVotes.ipynb",
"version": "0.3.2",
"provenance": [],
"collapsed_sections": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"metadata": {
"id": "PcAGsAxUsUrf",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"%pip install pyquery"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "bzn8tDoMvl8m",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"from google.colab import drive\n",
"# Mount the drive at /content/gdrive; later cells read and write files under this path.\n",
"drive.mount('/content/gdrive')"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "7mn9u5G9ncyQ",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"from pyquery import PyQuery as pq\n",
"import re\n",
"\n",
"class ParseError(Exception):\n",
"    \"\"\"Raised when a table row does not match any layout the parsers expect.\n",
"\n",
"    Attributes:\n",
"        id: Identifier of the item being parsed when the error occurred\n",
"            (an empty string when no identifier has been built yet).\n",
"        html: HTML of the offending row, kept for debugging.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, id, html):\n",
"        # Forward both values so they also appear in the exception's args/str().\n",
"        super().__init__(id, html)\n",
"        self.id = id\n",
"        self.html = html\n",
"\n",
"# Decide the vote result from the contents of a single vote cell.\n",
"def vote_decider(td):\n",
"    \"\"\"Map one vote cell to 1 (yea), 0 (nay or abstention), -1 (absent or excluded) or None.\n",
"\n",
"    Args:\n",
"        td: The vote cell element; it is wrapped with pq() before inspection.\n",
"\n",
"    Returns:\n",
"        1 when the cell shows the '賛成' image, or when a split vote has\n",
"        yea >= nay; 0 when it shows the '反対' or '×-b' image, when its text\n",
"        is '棄', or when a split vote has fewer yeas than nays; -1 when its\n",
"        text is '欠' or '除'; None when the cell matches no known pattern.\n",
"    \"\"\"\n",
"    element = pq(td)\n",
"\n",
"    # A cell holding both a yea image and a nay image means the faction was split:\n",
"    # decide by majority, with ties going to yea. Excluded or abstaining members\n",
"    # are not counted.\n",
"    if element.find('img[alt=\"賛成\"]') and (element.find('img[alt=\"反対\"]') or element.find('img[alt=\"×-b\"]')):\n",
"        paragraphs = element.find('p')\n",
"        if len(paragraphs) != 2:\n",
"            return None\n",
"        yea = 0\n",
"        nay = 0\n",
"        for p in paragraphs:\n",
"            p_element = pq(p)\n",
"            try:\n",
"                if p_element.find('img[alt=\"賛成\"]'):\n",
"                    yea = int(p_element.text())\n",
"                elif p_element.find('img[alt=\"反対\"]') or p_element.find('img[alt=\"×-b\"]'):\n",
"                    nay = int(p_element.text())\n",
"            except ValueError:\n",
"                # The count next to the image is not a number. Returning None makes\n",
"                # the caller raise ParseError with the row id and the row HTML.\n",
"                return None\n",
"        return 1 if yea >= nay else 0\n",
"\n",
"    # Without both images, the cell matches at most one of the patterns below.\n",
"    text = element.text()\n",
"    if element.find('img').attr('alt') == '賛成':\n",
"        return 1\n",
"    if element.find('img[alt=\"反対\"]') or element.find('img[alt=\"×-b\"]') or text == '棄':\n",
"        return 0\n",
"    if text == '欠' or text == '除':\n",
"        return -1\n",
"    return None\n",
" \n",
"def parse_gian(identifier, table):\n",
"    \"\"\"Extract per-bill voting results from a bill (gian) results table.\n",
"\n",
"    Args:\n",
"        identifier: Prefix prepended to every generated row id.\n",
"        table: HTML source of the results table.\n",
"\n",
"    Returns:\n",
"        A list of rows shaped [id, passed, vote, vote, ...]. `id` is\n",
"        identifier + '_' + category + '_' + first-cell text (the category part\n",
"        is omitted until a category row has been seen), `passed` is a bool, and\n",
"        each vote is the value vote_decider() returns for a cell from the\n",
"        fourth column on. Rows whose result is '継続審査' are omitted.\n",
"\n",
"    Raises:\n",
"        ParseError: If a category row, a result text, or a vote cell is not\n",
"            one of the recognised patterns.\n",
"    \"\"\"\n",
"    passed_results = ['可決', '採択', '異議なし', '認可', '承認', '同意', '認定', '可決・認定', '可決及び認定']\n",
"    rejected_results = ['不採択', '否決']\n",
"\n",
"    d = pq(table)\n",
"    gian_kind = ''\n",
"    votes = []\n",
"\n",
"    # nth-child(n+2) leaves out any row that is the first child of its parent.\n",
"    for tr in d('tr:nth-child(n+2)'):\n",
"        p_tr = pq(tr)\n",
"        if len(p_tr('td')) == 0 and len(p_tr('th')) == 0:\n",
"            continue\n",
"\n",
"        # A row collapsed into a single td is a bill-category heading row.\n",
"        if len(p_tr.find('td')) == 1:\n",
"            # NOTE: these are regular expressions, not globs: '市長*' matches any\n",
"            # text that starts with '市' followed by zero or more '長'.\n",
"            if re.match('市長*', p_tr.text()):\n",
"                gian_kind = 'P'\n",
"            elif re.match('委員会*', p_tr.text()):\n",
"                gian_kind = 'C'\n",
"            elif re.match('継続*', p_tr.text()):\n",
"                gian_kind = 'K'\n",
"            else:\n",
"                raise ParseError('', p_tr.html())\n",
"            continue\n",
"\n",
"        id_td = p_tr('td:first')\n",
"        if gian_kind != '':\n",
"            gian_id = identifier + '_' + gian_kind + '_' + id_td.text()\n",
"        else:\n",
"            gian_id = identifier + '_' + id_td.text()\n",
"\n",
"        # The result text sometimes contains line breaks; strip all whitespace.\n",
"        result = ''.join(p_tr('td').eq(2).text().split())\n",
"        if result == '継続審査':\n",
"            continue\n",
"\n",
"        # Anything outside the known result words is treated as an error.\n",
"        if result not in passed_results and result not in rejected_results:\n",
"            raise ParseError(gian_id, p_tr.html())\n",
"\n",
"        row = [gian_id, result in passed_results]\n",
"        for td in p_tr('td:nth-child(n+4)'):\n",
"            vote = vote_decider(td)\n",
"            if vote is None:\n",
"                raise ParseError(gian_id, p_tr.html())\n",
"            row.append(vote)\n",
"        votes.append(row)\n",
"    return votes\n",
"\n",
"def parse_seigan(identifier, table):\n",
"    \"\"\"Extract per-petition voting results from a petition (seigan) results table.\n",
"\n",
"    Args:\n",
"        identifier: Prefix prepended to every generated row id.\n",
"        table: HTML source of the results table.\n",
"\n",
"    Returns:\n",
"        A list of rows shaped [id, passed, vote, vote, ...]. `id` is\n",
"        identifier + '_' + kind + '_' + petition number + '_' + sentence number,\n",
"        `passed` is True only for the result '採択', and each vote is the value\n",
"        vote_decider() returns. Rows under continued deliberation ('継続審査'),\n",
"        rows marked '審議未了', and rows whose vote cells read '-' are omitted.\n",
"\n",
"    Raises:\n",
"        ParseError: If a result text or a vote cell is not a recognised pattern.\n",
"    \"\"\"\n",
"    d = pq(table)\n",
"    gian_kind = 'N'\n",
"    votes = []\n",
"    seigan_id = ''\n",
"    sentence_number = 0\n",
"    sentence_count = 0\n",
"    offset = 0\n",
"    with_sentence = True\n",
"    have_sentence = False\n",
"    skip = True\n",
"\n",
"    for tr in d('tr'):\n",
"        p_tr = pq(tr)\n",
"        if len(p_tr('td')) == 0 and len(p_tr('th')) == 0:\n",
"            continue\n",
"\n",
"        # Some tables start with a single-td row holding the petition kind; ignore\n",
"        # everything until the header row (first th reads '請願番号') has been seen.\n",
"        if ''.join(p_tr('th:first').text().split()) == '請願番号':\n",
"            if p_tr('th:nth-child(3)').text() != '主旨':\n",
"                with_sentence = False\n",
"            skip = False\n",
"            continue\n",
"\n",
"        if skip:\n",
"            continue\n",
"\n",
"        # A single-td row is a heading; it only matters when it switches the kind.\n",
"        if len(p_tr.find('td')) == 1:\n",
"            if re.match('継続*', p_tr.text()):\n",
"                gian_kind = 'K'\n",
"            continue\n",
"\n",
"        id_td = p_tr('td:first')\n",
"        if id_td.attr('rowspan'):\n",
"            # The petition has as many sentences (one row each) as the rowspan value.\n",
"            seigan_id = id_td.text()\n",
"            sentence_count = int(id_td.attr('rowspan'))\n",
"            sentence_number = 1\n",
"            offset = 0\n",
"            have_sentence = True\n",
"        elif have_sentence and sentence_number < sentence_count:\n",
"            # Another sentence of the same petition.\n",
"            sentence_number += 1\n",
"            offset = 2  # the number and title cells are absent from this row\n",
"        else:\n",
"            # A petition with a single sentence. The title and sentence cells may be\n",
"            # merged with colspan, in which case the later cells shift by one.\n",
"            have_sentence = False\n",
"            seigan_id = id_td.text()\n",
"            sentence_number = 1\n",
"            offset = 1 if p_tr.find('td').eq(1).attr('colspan') else 0\n",
"\n",
"        # Tables without a sentence column always use offset 1, i.e. they are\n",
"        # handled as if the cells were merged with colspan.\n",
"        if not with_sentence:\n",
"            offset = 1\n",
"\n",
"        row_id = identifier + '_' + gian_kind + '_' + seigan_id + '_' + str(sentence_number)\n",
"        if p_tr('td').eq(2 - offset).find('p:nth-child(2)').text() == '継続審査':\n",
"            continue\n",
"\n",
"        result = ''.join(p_tr('td').eq(3 - offset).text().split())\n",
"        if result == '継続審査' or result == '審議未了':\n",
"            continue\n",
"\n",
"        if result not in ['採択', '不採択', '不採択(議長裁決)']:\n",
"            raise ParseError(row_id, p_tr.html())\n",
"\n",
"        vote_cells = p_tr('td:nth-child(n+' + str(5 - offset) + ')')\n",
"        if vote_cells.text() == '-':\n",
"            continue\n",
"\n",
"        row = [row_id, result == '採択']\n",
"        for td in vote_cells:\n",
"            vote = vote_decider(td)\n",
"            if vote is None:\n",
"                raise ParseError(row_id, p_tr.html())\n",
"            row.append(vote)\n",
"        votes.append(row)\n",
"    return votes"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "EvlonBIJv1hQ",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"import glob\n",
"\n",
"# Sort the paths so the collected rows come out in the same order on every run.\n",
"filepaths = sorted(glob.glob('/content/gdrive/PATH/TO/YOUR/TABLE_HTML/H*.txt'))\n",
"# File names look like H<digits>_N<digits>_<kind>.txt.\n",
"file_reg = re.compile(r'^.*H(\\d+)_N(\\d+)_(.+)\\.txt$')\n",
"allVotes = []\n",
"for filepath in filepaths:\n",
"    print(filepath)\n",
"    result = file_reg.match(filepath)\n",
"    if result is None:\n",
"        raise ValueError('Unexpected file name: ' + filepath)\n",
"    identifier = result.group(1).zfill(2) + '_' + result.group(2).zfill(2) + '_' + result.group(3)\n",
"    # Close the file as soon as its contents have been read.\n",
"    with open(filepath) as f:\n",
"        table = f.read()\n",
"    # Kind 'S' marks a petition table; every other kind is parsed as a bill table.\n",
"    if result.group(3) == 'S':\n",
"        votes = parse_seigan(identifier, table)\n",
"    else:\n",
"        votes = parse_gian(identifier, table)\n",
"    allVotes.extend(votes)"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "B3zMx47wx-Nm",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"import csv\n",
"\n",
"# newline='' leaves line-ending handling to csv.writer, as the csv module requires.\n",
"with open('/content/gdrive/PATH/TO/YOUR/ANALYZED/RESULT/votes.csv', 'w', newline='') as out:\n",
"    writer = csv.writer(out)\n",
"    writer.writerows(allVotes)"
],
"execution_count": 0,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment