Last active
January 27, 2020 09:45
-
-
Save ckhung/0c6eb0173af65ee148ec9dae9aa9c2ee to your computer and use it in GitHub Desktop.
2020立委選舉統計資料轉檔
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3 | |
# 立委選舉統計資料轉檔 | |
# 詳見 https://newtoypia.blogspot.com/2020/01/xlsxlsx-csv.html | |
import sys, re, argparse | |
import pandas as pd | |
from numpy import isnan | |
from warnings import warn | |
pd.options.mode.chained_assignment = None | |
# https://stackoverflow.com/questions/49728421/pandas-dataframe-settingwithcopywarning-a-value-is-trying-to-be-set-on-a-copy | |
# https://stackoverflow.com/questions/47182183/pandas-chained-assignment-warning-exception-handling | |
PARTY_LIST = [ | |
'無', | |
'合一行動聯盟', | |
'中華統一促進黨', | |
'親民黨', | |
'安定力量', | |
'台灣基進', | |
'時代力量', | |
'新黨', | |
'喜樂島聯盟', | |
'中國國民黨', | |
'一邊一國行動黨', | |
'勞動黨', | |
'綠黨', | |
'宗教聯盟', | |
'民主進步黨', | |
'台灣民眾黨', | |
'台灣維新', | |
'台澎黨', | |
'國會政黨聯盟', | |
'台灣團結聯盟' | |
] | |
def collapse_sncols(sheet, colname): | |
# collapse same-name columns: | |
# 找出 "大致" 同名的那些欄位, 加總合併 | |
# sheet 的內容會被直接修改掉! | |
all_colname = list(filter(lambda x: x[:-1] == colname, sheet.columns)) | |
if len(all_colname) == 1: | |
sheet.rename( columns={all_colname[0]:colname}, inplace=True ) | |
elif len(all_colname) > 1: | |
# sheet.insert(len(sheet.columns)-1, colname, 0) | |
# 本來要插在 '選舉人數' 前面。 現在 '選舉人數' 已搬到最前面 | |
sheet[colname] = 0 | |
for cn in all_colname: | |
sheet[colname] += sheet[cn] | |
sheet.drop(labels=all_colname, axis='columns', inplace=True) | |
def sheet2csv(sheet, savepath='', prepend={}): | |
allcn = list(sheet.columns) | |
effective = list(filter(lambda x: 'Unnamed' in x, allcn))[3] | |
# 找出「有效票數」欄位 | |
sheet.rename( columns={ | |
allcn[0]: '鄉鎮區', | |
allcn[1]: '村里', | |
effective: '有效票數', | |
allcn[-2]: '選舉人數', | |
}, inplace=True ) | |
allcn = list(sheet.columns) | |
cut = sheet[['選舉人數','有效票數']] | |
sheet.drop(['有效票數', '選舉人數'], axis='columns', inplace=True) | |
sheet.insert(2, '選舉人數', cut['選舉人數']) | |
sheet.insert(3, '有效票數', cut['有效票數']) | |
# 把 '選舉人數' 欄移到前面。 | |
# 注意: 用 sheet = sheet[新順序] 是無效的, | |
# 因為 sheet 是 local variable | |
rename_dict = {} | |
ptid = { pt:ord('A') for pt in PARTY_LIST } | |
# 一黨有多位候選人時, 按 A,B,C, ... 設流水編號 | |
for cn in allcn: | |
m = re.search(r'\(\d+\)\n.*?\n(\S+)', cn) | |
if m: | |
pt = m.group(1) | |
if not pt in PARTY_LIST: | |
warn('小黨 "{}" 以無黨計算'.format(pt)) | |
pt = '無' | |
rename_dict[cn] = pt + chr(ptid[pt]) | |
ptid[pt] += 1 | |
sheet.rename( columns=rename_dict, inplace=True ) | |
sheet['鄉鎮區'].replace(r' (.*)', r'\1', inplace=True, regex=True) | |
sheet['鄉鎮區'].fillna(method='ffill', inplace=True) | |
#keep_only = list(filter(lambda x: re.search(r'%', x), sheet.columns)) | |
#sheet = sheet[ keep_only ] | |
to_drop = list(filter(lambda x: re.search(r'Unnamed', x), sheet.columns)) | |
sheet.drop(to_drop, axis='columns', inplace=True) | |
#rename_dict = { cn: cn[1:] for cn in sheet.columns } | |
#sheet.rename( columns=rename_dict, inplace=True ) | |
all_candidates = list( filter( | |
lambda x: x[:-1] in PARTY_LIST, | |
sheet.columns | |
) ) | |
for col in all_candidates + ['選舉人數', '有效票數']: | |
sheet[col] = sheet[col].astype(str).str.replace(",","") | |
sheet[col] = sheet[col].replace(r'[^\d.]', '0', regex=True).astype(float).astype(int) | |
sheet = sheet.dropna(subset=['鄉鎮區']) | |
sheet['村里'].fillna('小計', inplace=True) | |
for pt in PARTY_LIST: | |
collapse_sncols(sheet, pt) | |
sheet['驗算'] = 0 | |
for pt in PARTY_LIST: | |
if pt in sheet.columns: | |
sheet['驗算'] += sheet[pt] | |
verify = sheet['驗算'] - sheet['有效票數'] | |
assert(sum(verify * verify)==0) | |
sheet.drop(labels='驗算', axis='columns', inplace=True) | |
allpk = list(prepend.keys()) | |
allpk.reverse() | |
for pk in allpk: | |
sheet.insert(0, pk, prepend[pk]) | |
#print(sheet) | |
sheet.to_csv(savepath if savepath else sys.stdout, index=None, encoding='utf-8') | |
########################################################### | |
# main | |
########################################################### | |
parser = argparse.ArgumentParser( | |
description='2020立委得票統計轉檔', | |
formatter_class=argparse.ArgumentDefaultsHelpFormatter) | |
parser.add_argument('-s', '--savepath', type=str, default='/tmp', | |
help='儲存csv檔的目錄') | |
parser.add_argument('xlsfiles', nargs='*', help='xls檔 xls ...') | |
args = parser.parse_args() | |
#all_xls_fn = os.listdir() | |
#all_xls_fn = filter(lambda x: re.search('一覽表\(.*(縣|市)\)\.xls$',x), all_xls_fn) | |
#all_xls_fn = ['平地立委-A05-4-得票數一覽表(彰化縣).xls'] | |
#all_xls_fn = ['區域立委-A05-2-得票數一覽表(花蓮縣).xls'] | |
#all_xls_fn = ['區域立委-A05-2-得票數一覽表(臺中市).xls'] | |
all_xls_fn = args.xlsfiles | |
for fn in all_xls_fn: | |
m = re.search(r'(\S+?)-.*?\((.*?)\)', fn) | |
elec_type, city = (m.group(1), m.group(2)) | |
all_sheets = pd.read_excel(fn, None, encoding='utf-8', skiprows=2, thousandsstr=',') | |
all_sn = all_sheets.keys() # sheet names | |
#all_sn =['臺中市第4選舉區'] | |
for sn in all_sn: | |
print('{} : {}'.format(fn, sn)) | |
sheet2csv(all_sheets[sn], prepend={'選票類型':elec_type, '縣市':city}, savepath=args.savepath+'/{}_{}.csv'.format(elec_type, sn)) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
原本刪掉的 鄉鎮區小計及縣市總計 現在把它加回來。