hokuma/AllVotes.ipynb

## AllVotes.ipynb
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "AllVotes.ipynb",
      "version": "0.3.2",
      "provenance": [],
      "collapsed_sections": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    }
  },
  "cells": [
    {
      "metadata": {
        "id": "PcAGsAxUsUrf",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "!pip install pyquery"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "bzn8tDoMvl8m",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "from google.colab import drive\n",
        "drive.mount('/content/gdrive')"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "7mn9u5G9ncyQ",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "from pyquery import PyQuery as pq\n",
        "import re\n",
        "\n",
        "class ParseError(Exception):\n",
        "  def __init__(self, id, html):\n",
        "    self.id = id\n",
        "    self.html = html\n",
        "\n",
        "# 投票セルの内容から結果を決定する\n",
        "def vote_decider(td):\n",
        "  element = pq(td)\n",
        "  vote = 0\n",
        "  # 賛成、反対の両方の画像を含むときは、会派内で意見が割れているため多数決で決める。優劣つかない場合は賛成優先。\n",
        "  # 除外、棄権などの数は含めない。\n",
        "  if element.find('img[alt=\"賛成\"]') and (element.find('img[alt=\"反対\"]') or element.find('img[alt=\"×-b\"]')):\n",
        "    if len(element.find('p')) != 2:\n",
        "      return None\n",
        "    yea = 0\n",
        "    nay = 0\n",
        "    for p in element.find('p'):\n",
        "      try:\n",
        "        if pq(p).find('img[alt=\"賛成\"]'):\n",
        "          yea = int(pq(p).text())\n",
        "        elif pq(p).find('img[alt=\"反対\"]') or pq(p).find('img[alt=\"×-b\"]'):\n",
        "          nay = int(pq(p).text())\n",
        "      except ValueError:\n",
        "        raise ParseError(id, row.html())\n",
        "      vote = 1 if yea >= nay else 0\n",
        "  else:\n",
        "    # 賛成、反対の双方を含まないときは、以下のどれかのパターンに該当する\n",
        "    if element.find('img').attr('alt') == '賛成':\n",
        "      vote = 1\n",
        "    elif element.find('img[alt=\"反対\"]') or element.find('img[alt=\"×-b\"]') or element.text() == '棄':\n",
        "      vote = 0\n",
        "    elif element.text() == '欠' or element.text() == '除':\n",
        "      vote = -1\n",
        "    else:\n",
        "      vote = None\n",
        "      \n",
        "  return vote\n",
        "      \n",
        "def parse_gian(identifier, table):\n",
        "  d = pq(table)\n",
        "  gian_kind = ''\n",
        "  votes = []\n",
        "  \n",
        "  trs = d('tr:nth-child(n+2)')\n",
        "  for tr in trs:\n",
        "    p_tr = pq(tr)\n",
        "    row = []\n",
        "    if len(p_tr('td')) == 0 and len(p_tr('th')) == 0:\n",
        "      continue\n",
        "    \n",
        "    # td1つにまとめられている場合は議案の種別行\n",
        "    if len(p_tr.find('td')) == 1:\n",
        "      if re.match('市長*', p_tr.text()): \n",
        "        gian_kind = 'P'\n",
        "      elif re.match('委員会*', p_tr.text()):\n",
        "        gian_kind = 'C'\n",
        "      elif re.match('継続*', p_tr.text()):\n",
        "        gian_kind = 'K'\n",
        "      else:\n",
        "        raise ParseError('', p_tr.html())\n",
        "    else:\n",
        "      id_td = p_tr('td:first')\n",
        "      id = ''\n",
        "      if gian_kind != '':\n",
        "        id = identifier + '_' + gian_kind + '_' + id_td.text()\n",
        "      else:\n",
        "        id = identifier + '_' + id_td.text()\n",
        "        \n",
        "      # 改行入ってるケースがあり邪魔なので消す\n",
        "      result = ''.join(p_tr('td').eq(2).text().split())\n",
        "      if result == '継続審査':\n",
        "        continue\n",
        "      \n",
        "      # 想定外のものがあったら例外とする\n",
        "      if result not in ['可決', '採択', '異議なし', '認可', '承認', '同意', '不採択', '認定', '否決', '可決・認定', '可決及び認定']:\n",
        "        raise ParseError(id, row.html())\n",
        "        \n",
        "      passed = result in ['可決', '採択', '異議なし', '認可', '承認', '同意', '認定', '可決・認定', '可決及び認定']\n",
        "      row.append(id)\n",
        "      row.append(passed)\n",
        "      for td in p_tr('td:nth-child(n+4)'):\n",
        "        vote = vote_decider(td)\n",
        "        if vote == None:\n",
        "          raise ParseError(id, p_tr.html())\n",
        "        row.append(vote)\n",
        "      votes.append(row)\n",
        "  return votes\n",
        "\n",
        "def parse_seigan(identifier, table):\n",
        "  d = pq(table)\n",
        "  gian_kind = 'N'\n",
        "  votes = []\n",
        "  seigan_id = 0\n",
        "  sentence_number = 0\n",
        "  offset = 0\n",
        "  with_sentence = True\n",
        "  have_sentence = False\n",
        "  skip = True\n",
        "  trs = d('tr')\n",
        "    \n",
        "  for tr in trs:\n",
        "    p_tr = pq(tr)\n",
        "    if len(p_tr('td')) == 0 and len(p_tr('th')) == 0:\n",
        "      continue\n",
        "    \n",
        "    # 先頭行にtd1つで請願種別が入っているケースがあり無視したい\n",
        "    if ''.join(p_tr('th:first').text().split()) == '請願番号':\n",
        "      if p_tr('th:nth-child(3)').text() != '主旨':\n",
        "        with_sentence = False\n",
        "      skip = False\n",
        "      continue\n",
        "      \n",
        "    if skip:\n",
        "      continue\n",
        "      \n",
        "    row = []\n",
        "    if len(p_tr.find('td')) == 1:\n",
        "      if re.match('継続*', p_tr.text()):\n",
        "        gian_kind = 'K'\n",
        "        continue\n",
        "    else:\n",
        "      id_td = p_tr('td:first')\n",
        "      # rowspanの数だけ主旨がある\n",
        "      if id_td.attr('rowspan'):\n",
        "        seigan_id = id_td.text()\n",
        "        sentence_count = int(id_td.attr('rowspan'))\n",
        "        sentence_number = 1\n",
        "        offset = 0\n",
        "        have_sentence = True\n",
        "      else:\n",
        "        # 同一請願で異なる主旨\n",
        "        if have_sentence and sentence_number < sentence_count:\n",
        "          sentence_number = sentence_number + 1\n",
        "          offset = 2 # 番号とタイトル分だけtdセルが減る\n",
        "        else:\n",
        "          # 主旨が1つの請願\n",
        "          # colspanでタイトルと主旨のセルがまとめられているケースがあるのでoffsetを計算\n",
        "          have_sentence = False\n",
        "          seigan_id = id_td.text()\n",
        "          sentence_number = 1\n",
        "          offset = 1 if p_tr.find('td').eq(1).attr('colspan') else 0\n",
        "          \n",
        "      # 主旨を持たないテーブルのときはoffsetは固定で1とする\n",
        "      # 常に、colspanでまとめられている状態として扱う\n",
        "      if not with_sentence:\n",
        "        offset = 1\n",
        "          \n",
        "      id = identifier + '_' + gian_kind + '_' + seigan_id + '_' + str(sentence_number)\n",
        "      if p_tr('td').eq(2-offset).find('p:nth-child(2)').text() == '継続審査':\n",
        "        continue\n",
        "        \n",
        "      result = ''.join(p_tr('td').eq(3-offset).text().split())\n",
        "      if result == '継続審査' or result == '審議未了':\n",
        "        continue\n",
        "        \n",
        "      if result not in ['採択', '不採択', '不採択（議長裁決）']:\n",
        "        raise ParseError(id, p_tr.html())\n",
        "        \n",
        "      passed = result == '採択'\n",
        "      row.append(id)\n",
        "      row.append(passed)\n",
        "      if p_tr('td:nth-child(n+' + str(5-offset) + ')').text() == '－':\n",
        "        continue\n",
        "        \n",
        "      for td in p_tr('td:nth-child(n+' + str(5-offset) + ')'):\n",
        "        vote = vote_decider(td)\n",
        "        if vote == None:\n",
        "          raise ParseError(id, p_tr.html())\n",
        "        row.append(vote)\n",
        "      votes.append(row)\n",
        "  return votes"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "EvlonBIJv1hQ",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "import glob\n",
        "\n",
        "filepaths = glob.glob('/content/gdrive/PATH/TO/YOUR/TABLE_HTML/H*.txt')\n",
        "file_reg = re.compile('^.*H(\\d+)_N(\\d+)_(.+)\\.txt$')\n",
        "allVotes = []\n",
        "for filepath in filepaths:\n",
        "  print(filepath)\n",
        "  result = re.match(file_reg, filepath)\n",
        "  identifier = result.group(1).zfill(2) + '_' + result.group(2).zfill(2) + '_' + result.group(3)\n",
        "  f = open(filepath)\n",
        "  table = f.read()\n",
        "  if result.group(3) == 'S':\n",
        "    votes = parse_seigan(identifier, table)\n",
        "  else:\n",
        "    votes = parse_gian(identifier, table)\n",
        "  for vote in votes:\n",
        "    allVotes.append(vote)\n",
        "  "
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "B3zMx47wx-Nm",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "import csv\n",
        "\n",
        "with open('/content/gdrive/PATH/TO/YOUR/ANALYZED/RESULT/votes.csv', 'w') as out:\n",
        "  writer = csv.writer(out)\n",
        "  for line in allVotes:\n",
        "    writer.writerow(line)"
      ],
      "execution_count": 0,
      "outputs": []
    }
  ]
}
	{
	"nbformat": 4,
	"nbformat_minor": 0,
	"metadata": {
	"colab": {
	"name": "AllVotes.ipynb",
	"version": "0.3.2",
	"provenance": [],
	"collapsed_sections": []
	},
	"kernelspec": {
	"name": "python3",
	"display_name": "Python 3"
	}
	},
	"cells": [
	{
	"metadata": {
	"id": "PcAGsAxUsUrf",
	"colab_type": "code",
	"colab": {}
	},
	"cell_type": "code",
	"source": [
	"!pip install pyquery"
	],
	"execution_count": 0,
	"outputs": []
	},
	{
	"metadata": {
	"id": "bzn8tDoMvl8m",
	"colab_type": "code",
	"colab": {}
	},
	"cell_type": "code",
	"source": [
	"from google.colab import drive\n",
	"drive.mount('/content/gdrive')"
	],
	"execution_count": 0,
	"outputs": []
	},
	{
	"metadata": {
	"id": "7mn9u5G9ncyQ",
	"colab_type": "code",
	"colab": {}
	},
	"cell_type": "code",
	"source": [
	"from pyquery import PyQuery as pq\n",
	"import re\n",
	"\n",
	"class ParseError(Exception):\n",
	" def __init__(self, id, html):\n",
	" self.id = id\n",
	" self.html = html\n",
	"\n",
	"# 投票セルの内容から結果を決定する\n",
	"def vote_decider(td):\n",
	" element = pq(td)\n",
	" vote = 0\n",
	" # 賛成、反対の両方の画像を含むときは、会派内で意見が割れているため多数決で決める。優劣つかない場合は賛成優先。\n",
	" # 除外、棄権などの数は含めない。\n",
	" if element.find('img[alt=\"賛成\"]') and (element.find('img[alt=\"反対\"]') or element.find('img[alt=\"×-b\"]')):\n",
	" if len(element.find('p')) != 2:\n",
	" return None\n",
	" yea = 0\n",
	" nay = 0\n",
	" for p in element.find('p'):\n",
	" try:\n",
	" if pq(p).find('img[alt=\"賛成\"]'):\n",
	" yea = int(pq(p).text())\n",
	" elif pq(p).find('img[alt=\"反対\"]') or pq(p).find('img[alt=\"×-b\"]'):\n",
	" nay = int(pq(p).text())\n",
	" except ValueError:\n",
	" raise ParseError(id, row.html())\n",
	" vote = 1 if yea >= nay else 0\n",
	" else:\n",
	" # 賛成、反対の双方を含まないときは、以下のどれかのパターンに該当する\n",
	" if element.find('img').attr('alt') == '賛成':\n",
	" vote = 1\n",
	" elif element.find('img[alt=\"反対\"]') or element.find('img[alt=\"×-b\"]') or element.text() == '棄':\n",
	" vote = 0\n",
	" elif element.text() == '欠' or element.text() == '除':\n",
	" vote = -1\n",
	" else:\n",
	" vote = None\n",
	" \n",
	" return vote\n",
	" \n",
	"def parse_gian(identifier, table):\n",
	" d = pq(table)\n",
	" gian_kind = ''\n",
	" votes = []\n",
	" \n",
	" trs = d('tr:nth-child(n+2)')\n",
	" for tr in trs:\n",
	" p_tr = pq(tr)\n",
	" row = []\n",
	" if len(p_tr('td')) == 0 and len(p_tr('th')) == 0:\n",
	" continue\n",
	" \n",
	" # td1つにまとめられている場合は議案の種別行\n",
	" if len(p_tr.find('td')) == 1:\n",
	" if re.match('市長*', p_tr.text()): \n",
	" gian_kind = 'P'\n",
	" elif re.match('委員会*', p_tr.text()):\n",
	" gian_kind = 'C'\n",
	" elif re.match('継続*', p_tr.text()):\n",
	" gian_kind = 'K'\n",
	" else:\n",
	" raise ParseError('', p_tr.html())\n",
	" else:\n",
	" id_td = p_tr('td:first')\n",
	" id = ''\n",
	" if gian_kind != '':\n",
	" id = identifier + '_' + gian_kind + '_' + id_td.text()\n",
	" else:\n",
	" id = identifier + '_' + id_td.text()\n",
	" \n",
	" # 改行入ってるケースがあり邪魔なので消す\n",
	" result = ''.join(p_tr('td').eq(2).text().split())\n",
	" if result == '継続審査':\n",
	" continue\n",
	" \n",
	" # 想定外のものがあったら例外とする\n",
	" if result not in ['可決', '採択', '異議なし', '認可', '承認', '同意', '不採択', '認定', '否決', '可決・認定', '可決及び認定']:\n",
	" raise ParseError(id, row.html())\n",
	" \n",
	" passed = result in ['可決', '採択', '異議なし', '認可', '承認', '同意', '認定', '可決・認定', '可決及び認定']\n",
	" row.append(id)\n",
	" row.append(passed)\n",
	" for td in p_tr('td:nth-child(n+4)'):\n",
	" vote = vote_decider(td)\n",
	" if vote == None:\n",
	" raise ParseError(id, p_tr.html())\n",
	" row.append(vote)\n",
	" votes.append(row)\n",
	" return votes\n",
	"\n",
	"def parse_seigan(identifier, table):\n",
	" d = pq(table)\n",
	" gian_kind = 'N'\n",
	" votes = []\n",
	" seigan_id = 0\n",
	" sentence_number = 0\n",
	" offset = 0\n",
	" with_sentence = True\n",
	" have_sentence = False\n",
	" skip = True\n",
	" trs = d('tr')\n",
	" \n",
	" for tr in trs:\n",
	" p_tr = pq(tr)\n",
	" if len(p_tr('td')) == 0 and len(p_tr('th')) == 0:\n",
	" continue\n",
	" \n",
	" # 先頭行にtd1つで請願種別が入っているケースがあり無視したい\n",
	" if ''.join(p_tr('th:first').text().split()) == '請願番号':\n",
	" if p_tr('th:nth-child(3)').text() != '主旨':\n",
	" with_sentence = False\n",
	" skip = False\n",
	" continue\n",
	" \n",
	" if skip:\n",
	" continue\n",
	" \n",
	" row = []\n",
	" if len(p_tr.find('td')) == 1:\n",
	" if re.match('継続*', p_tr.text()):\n",
	" gian_kind = 'K'\n",
	" continue\n",
	" else:\n",
	" id_td = p_tr('td:first')\n",
	" # rowspanの数だけ主旨がある\n",
	" if id_td.attr('rowspan'):\n",
	" seigan_id = id_td.text()\n",
	" sentence_count = int(id_td.attr('rowspan'))\n",
	" sentence_number = 1\n",
	" offset = 0\n",
	" have_sentence = True\n",
	" else:\n",
	" # 同一請願で異なる主旨\n",
	" if have_sentence and sentence_number < sentence_count:\n",
	" sentence_number = sentence_number + 1\n",
	" offset = 2 # 番号とタイトル分だけtdセルが減る\n",
	" else:\n",
	" # 主旨が1つの請願\n",
	" # colspanでタイトルと主旨のセルがまとめられているケースがあるのでoffsetを計算\n",
	" have_sentence = False\n",
	" seigan_id = id_td.text()\n",
	" sentence_number = 1\n",
	" offset = 1 if p_tr.find('td').eq(1).attr('colspan') else 0\n",
	" \n",
	" # 主旨を持たないテーブルのときはoffsetは固定で1とする\n",
	" # 常に、colspanでまとめられている状態として扱う\n",
	" if not with_sentence:\n",
	" offset = 1\n",
	" \n",
	" id = identifier + '_' + gian_kind + '_' + seigan_id + '_' + str(sentence_number)\n",
	" if p_tr('td').eq(2-offset).find('p:nth-child(2)').text() == '継続審査':\n",
	" continue\n",
	" \n",
	" result = ''.join(p_tr('td').eq(3-offset).text().split())\n",
	" if result == '継続審査' or result == '審議未了':\n",
	" continue\n",
	" \n",
	" if result not in ['採択', '不採択', '不採択（議長裁決）']:\n",
	" raise ParseError(id, p_tr.html())\n",
	" \n",
	" passed = result == '採択'\n",
	" row.append(id)\n",
	" row.append(passed)\n",
	" if p_tr('td:nth-child(n+' + str(5-offset) + ')').text() == '－':\n",
	" continue\n",
	" \n",
	" for td in p_tr('td:nth-child(n+' + str(5-offset) + ')'):\n",
	" vote = vote_decider(td)\n",
	" if vote == None:\n",
	" raise ParseError(id, p_tr.html())\n",
	" row.append(vote)\n",
	" votes.append(row)\n",
	" return votes"
	],
	"execution_count": 0,
	"outputs": []
	},
	{
	"metadata": {
	"id": "EvlonBIJv1hQ",
	"colab_type": "code",
	"colab": {}
	},
	"cell_type": "code",
	"source": [
	"import glob\n",
	"\n",
	"filepaths = glob.glob('/content/gdrive/PATH/TO/YOUR/TABLE_HTML/H*.txt')\n",
	"file_reg = re.compile('^.*H(\\d+)_N(\\d+)_(.+)\\.txt$')\n",
	"allVotes = []\n",
	"for filepath in filepaths:\n",
	" print(filepath)\n",
	" result = re.match(file_reg, filepath)\n",
	" identifier = result.group(1).zfill(2) + '_' + result.group(2).zfill(2) + '_' + result.group(3)\n",
	" f = open(filepath)\n",
	" table = f.read()\n",
	" if result.group(3) == 'S':\n",
	" votes = parse_seigan(identifier, table)\n",
	" else:\n",
	" votes = parse_gian(identifier, table)\n",
	" for vote in votes:\n",
	" allVotes.append(vote)\n",
	" "
	],
	"execution_count": 0,
	"outputs": []
	},
	{
	"metadata": {
	"id": "B3zMx47wx-Nm",
	"colab_type": "code",
	"colab": {}
	},
	"cell_type": "code",
	"source": [
	"import csv\n",
	"\n",
	"with open('/content/gdrive/PATH/TO/YOUR/ANALYZED/RESULT/votes.csv', 'w') as out:\n",
	" writer = csv.writer(out)\n",
	" for line in allVotes:\n",
	" writer.writerow(line)"
	],
	"execution_count": 0,
	"outputs": []
	}
	]
	}