Skip to content

Instantly share code, notes, and snippets.

@ckhung
Last active January 27, 2020 09:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ckhung/0c6eb0173af65ee148ec9dae9aa9c2ee to your computer and use it in GitHub Desktop.
Save ckhung/0c6eb0173af65ee148ec9dae9aa9c2ee to your computer and use it in GitHub Desktop.
2020立委選舉統計資料轉檔
#!/usr/bin/python3
# 立委選舉統計資料轉檔
# 詳見 https://newtoypia.blogspot.com/2020/01/xlsxlsx-csv.html
import sys, re, argparse
import pandas as pd
from numpy import isnan
from warnings import warn
pd.options.mode.chained_assignment = None
# https://stackoverflow.com/questions/49728421/pandas-dataframe-settingwithcopywarning-a-value-is-trying-to-be-set-on-a-copy
# https://stackoverflow.com/questions/47182183/pandas-chained-assignment-warning-exception-handling
PARTY_LIST = [
'無',
'合一行動聯盟',
'中華統一促進黨',
'親民黨',
'安定力量',
'台灣基進',
'時代力量',
'新黨',
'喜樂島聯盟',
'中國國民黨',
'一邊一國行動黨',
'勞動黨',
'綠黨',
'宗教聯盟',
'民主進步黨',
'台灣民眾黨',
'台灣維新',
'台澎黨',
'國會政黨聯盟',
'台灣團結聯盟'
]
def collapse_sncols(sheet, colname):
# collapse same-name columns:
# 找出 "大致" 同名的那些欄位, 加總合併
# sheet 的內容會被直接修改掉!
all_colname = list(filter(lambda x: x[:-1] == colname, sheet.columns))
if len(all_colname) == 1:
sheet.rename( columns={all_colname[0]:colname}, inplace=True )
elif len(all_colname) > 1:
# sheet.insert(len(sheet.columns)-1, colname, 0)
# 本來要插在 '選舉人數' 前面。 現在 '選舉人數' 已搬到最前面
sheet[colname] = 0
for cn in all_colname:
sheet[colname] += sheet[cn]
sheet.drop(labels=all_colname, axis='columns', inplace=True)
def sheet2csv(sheet, savepath='', prepend={}):
allcn = list(sheet.columns)
effective = list(filter(lambda x: 'Unnamed' in x, allcn))[3]
# 找出「有效票數」欄位
sheet.rename( columns={
allcn[0]: '鄉鎮區',
allcn[1]: '村里',
effective: '有效票數',
allcn[-2]: '選舉人數',
}, inplace=True )
allcn = list(sheet.columns)
cut = sheet[['選舉人數','有效票數']]
sheet.drop(['有效票數', '選舉人數'], axis='columns', inplace=True)
sheet.insert(2, '選舉人數', cut['選舉人數'])
sheet.insert(3, '有效票數', cut['有效票數'])
# 把 '選舉人數' 欄移到前面。
# 注意: 用 sheet = sheet[新順序] 是無效的,
# 因為 sheet 是 local variable
rename_dict = {}
ptid = { pt:ord('A') for pt in PARTY_LIST }
# 一黨有多位候選人時, 按 A,B,C, ... 設流水編號
for cn in allcn:
m = re.search(r'\(\d+\)\n.*?\n(\S+)', cn)
if m:
pt = m.group(1)
if not pt in PARTY_LIST:
warn('小黨 "{}" 以無黨計算'.format(pt))
pt = '無'
rename_dict[cn] = pt + chr(ptid[pt])
ptid[pt] += 1
sheet.rename( columns=rename_dict, inplace=True )
sheet['鄉鎮區'].replace(r' (.*)', r'\1', inplace=True, regex=True)
sheet['鄉鎮區'].fillna(method='ffill', inplace=True)
#keep_only = list(filter(lambda x: re.search(r'%', x), sheet.columns))
#sheet = sheet[ keep_only ]
to_drop = list(filter(lambda x: re.search(r'Unnamed', x), sheet.columns))
sheet.drop(to_drop, axis='columns', inplace=True)
#rename_dict = { cn: cn[1:] for cn in sheet.columns }
#sheet.rename( columns=rename_dict, inplace=True )
all_candidates = list( filter(
lambda x: x[:-1] in PARTY_LIST,
sheet.columns
) )
for col in all_candidates + ['選舉人數', '有效票數']:
sheet[col] = sheet[col].astype(str).str.replace(",","")
sheet[col] = sheet[col].replace(r'[^\d.]', '0', regex=True).astype(float).astype(int)
sheet = sheet.dropna(subset=['鄉鎮區'])
sheet['村里'].fillna('小計', inplace=True)
for pt in PARTY_LIST:
collapse_sncols(sheet, pt)
sheet['驗算'] = 0
for pt in PARTY_LIST:
if pt in sheet.columns:
sheet['驗算'] += sheet[pt]
verify = sheet['驗算'] - sheet['有效票數']
assert(sum(verify * verify)==0)
sheet.drop(labels='驗算', axis='columns', inplace=True)
allpk = list(prepend.keys())
allpk.reverse()
for pk in allpk:
sheet.insert(0, pk, prepend[pk])
#print(sheet)
sheet.to_csv(savepath if savepath else sys.stdout, index=None, encoding='utf-8')
###########################################################
# main
###########################################################
parser = argparse.ArgumentParser(
description='2020立委得票統計轉檔',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-s', '--savepath', type=str, default='/tmp',
help='儲存csv檔的目錄')
parser.add_argument('xlsfiles', nargs='*', help='xls檔 xls ...')
args = parser.parse_args()
#all_xls_fn = os.listdir()
#all_xls_fn = filter(lambda x: re.search('一覽表\(.*(縣|市)\)\.xls$',x), all_xls_fn)
#all_xls_fn = ['平地立委-A05-4-得票數一覽表(彰化縣).xls']
#all_xls_fn = ['區域立委-A05-2-得票數一覽表(花蓮縣).xls']
#all_xls_fn = ['區域立委-A05-2-得票數一覽表(臺中市).xls']
all_xls_fn = args.xlsfiles
for fn in all_xls_fn:
m = re.search(r'(\S+?)-.*?\((.*?)\)', fn)
elec_type, city = (m.group(1), m.group(2))
all_sheets = pd.read_excel(fn, None, encoding='utf-8', skiprows=2, thousandsstr=',')
all_sn = all_sheets.keys() # sheet names
#all_sn =['臺中市第4選舉區']
for sn in all_sn:
print('{} : {}'.format(fn, sn))
sheet2csv(all_sheets[sn], prepend={'選票類型':elec_type, '縣市':city}, savepath=args.savepath+'/{}_{}.csv'.format(elec_type, sn))
@ckhung
Copy link
Author

ckhung commented Jan 27, 2020

原本刪掉的 鄉鎮區小計及縣市總計 現在把它加回來。

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment