ckhung/elecxls2csv.py

## elecxls2csv.py
#!/usr/bin/python3

# 立委選舉統計資料轉檔
# 詳見 https://newtoypia.blogspot.com/2020/01/xlsxlsx-csv.html

import sys, re, argparse
import pandas as pd
from numpy import isnan
from warnings import warn

pd.options.mode.chained_assignment = None
# https://stackoverflow.com/questions/49728421/pandas-dataframe-settingwithcopywarning-a-value-is-trying-to-be-set-on-a-copy
# https://stackoverflow.com/questions/47182183/pandas-chained-assignment-warning-exception-handling

PARTY_LIST = [
    '無',
    '合一行動聯盟',
    '中華統一促進黨',
    '親民黨',
    '安定力量',
    '台灣基進',
    '時代力量',
    '新黨',
    '喜樂島聯盟',
    '中國國民黨',
    '一邊一國行動黨',
    '勞動黨',
    '綠黨',
    '宗教聯盟',
    '民主進步黨',
    '台灣民眾黨',
    '台灣維新',
    '台澎黨',
    '國會政黨聯盟',
    '台灣團結聯盟'
]

def collapse_sncols(sheet, colname):
    # collapse same-name columns:
    # 找出 "大致" 同名的那些欄位， 加總合併
    # sheet 的內容會被直接修改掉！
    all_colname = list(filter(lambda x: x[:-1] == colname, sheet.columns))
    if len(all_colname) == 1:
        sheet.rename( columns={all_colname[0]:colname}, inplace=True )
    elif len(all_colname) > 1:
        # sheet.insert(len(sheet.columns)-1, colname, 0)
        # 本來要插在 '選舉人數' 前面。 現在 '選舉人數' 已搬到最前面
        sheet[colname] = 0
        for cn in all_colname:
            sheet[colname] += sheet[cn]
        sheet.drop(labels=all_colname, axis='columns', inplace=True)

def sheet2csv(sheet, savepath='', prepend={}):
    allcn = list(sheet.columns)
    effective = list(filter(lambda x: 'Unnamed' in x, allcn))[3]
    # 找出「有效票數」欄位
    sheet.rename( columns={
        allcn[0]: '鄉鎮區',
        allcn[1]: '村里',
        effective: '有效票數',
        allcn[-2]: '選舉人數',
    }, inplace=True )
    allcn = list(sheet.columns)
    cut = sheet[['選舉人數','有效票數']]
    sheet.drop(['有效票數', '選舉人數'], axis='columns', inplace=True)
    sheet.insert(2, '選舉人數', cut['選舉人數'])
    sheet.insert(3, '有效票數', cut['有效票數'])
    # 把 '選舉人數' 欄移到前面。
    # 注意： 用 sheet = sheet[新順序] 是無效的，
    # 因為 sheet 是 local variable
    rename_dict = {}
    ptid = { pt:ord('A') for pt in PARTY_LIST }
    # 一黨有多位候選人時， 按 A,B,C, ... 設流水編號
    for cn in allcn:
        m = re.search(r'\(\d+\)\n.*?\n(\S+)', cn)
        if m:
            pt = m.group(1)
            if not pt in PARTY_LIST:
                warn('小黨 "{}" 以無黨計算'.format(pt))
                pt = '無'
            rename_dict[cn] = pt + chr(ptid[pt])
            ptid[pt] += 1
    sheet.rename( columns=rename_dict, inplace=True )
    sheet['鄉鎮區'].replace(r'　(.*)', r'\1', inplace=True, regex=True)
    sheet['鄉鎮區'].fillna(method='ffill', inplace=True)
    #keep_only = list(filter(lambda x: re.search(r'%', x), sheet.columns))
    #sheet = sheet[ keep_only ]
    to_drop = list(filter(lambda x: re.search(r'Unnamed', x), sheet.columns))
    sheet.drop(to_drop, axis='columns', inplace=True)
    #rename_dict = { cn: cn[1:] for cn in sheet.columns }
    #sheet.rename( columns=rename_dict, inplace=True )
    all_candidates = list( filter(
        lambda x: x[:-1] in PARTY_LIST,
        sheet.columns
    ) )
    for col in all_candidates + ['選舉人數', '有效票數']:
        sheet[col] = sheet[col].astype(str).str.replace(",","")
        sheet[col] = sheet[col].replace(r'[^\d.]', '0', regex=True).astype(float).astype(int)
    sheet = sheet.dropna(subset=['鄉鎮區'])
    sheet['村里'].fillna('小計', inplace=True)
    for pt in PARTY_LIST:
        collapse_sncols(sheet, pt)

    sheet['驗算'] = 0
    for pt in PARTY_LIST:
        if pt in sheet.columns:
            sheet['驗算'] += sheet[pt]
    verify = sheet['驗算'] - sheet['有效票數']
    assert(sum(verify * verify)==0)
    sheet.drop(labels='驗算', axis='columns', inplace=True)

    allpk = list(prepend.keys())
    allpk.reverse()
    for pk in allpk:
        sheet.insert(0, pk, prepend[pk])
    #print(sheet)
    sheet.to_csv(savepath if savepath else sys.stdout, index=None, encoding='utf-8')


###########################################################
# main
###########################################################

parser = argparse.ArgumentParser(
    description='2020立委得票統計轉檔',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-s', '--savepath', type=str, default='/tmp',
    help='儲存csv檔的目錄')
parser.add_argument('xlsfiles', nargs='*', help='xls檔 xls ...')
args = parser.parse_args()

#all_xls_fn = os.listdir()
#all_xls_fn = filter(lambda x: re.search('一覽表\(.*(縣|市)\)\.xls$',x), all_xls_fn)
#all_xls_fn = ['平地立委-A05-4-得票數一覽表(彰化縣).xls']
#all_xls_fn = ['區域立委-A05-2-得票數一覽表(花蓮縣).xls']
#all_xls_fn = ['區域立委-A05-2-得票數一覽表(臺中市).xls']
all_xls_fn = args.xlsfiles
for fn in all_xls_fn:
    m = re.search(r'(\S+?)-.*?\((.*?)\)', fn)
    elec_type, city = (m.group(1), m.group(2))
    all_sheets = pd.read_excel(fn, None, encoding='utf-8', skiprows=2, thousandsstr=',')
    all_sn = all_sheets.keys()   # sheet names
    #all_sn =['臺中市第4選舉區']
    for sn in all_sn:
        print('{} ： {}'.format(fn, sn))
        sheet2csv(all_sheets[sn], prepend={'選票類型':elec_type, '縣市':city}, savepath=args.savepath+'/{}_{}.csv'.format(elec_type, sn))
	#!/usr/bin/python3

	# 立委選舉統計資料轉檔
	# 詳見 https://newtoypia.blogspot.com/2020/01/xlsxlsx-csv.html

	import sys, re, argparse
	import pandas as pd
	from numpy import isnan
	from warnings import warn

	pd.options.mode.chained_assignment = None
	# https://stackoverflow.com/questions/49728421/pandas-dataframe-settingwithcopywarning-a-value-is-trying-to-be-set-on-a-copy
	# https://stackoverflow.com/questions/47182183/pandas-chained-assignment-warning-exception-handling

	PARTY_LIST = [
	'無',
	'合一行動聯盟',
	'中華統一促進黨',
	'親民黨',
	'安定力量',
	'台灣基進',
	'時代力量',
	'新黨',
	'喜樂島聯盟',
	'中國國民黨',
	'一邊一國行動黨',
	'勞動黨',
	'綠黨',
	'宗教聯盟',
	'民主進步黨',
	'台灣民眾黨',
	'台灣維新',
	'台澎黨',
	'國會政黨聯盟',
	'台灣團結聯盟'
	]

	def collapse_sncols(sheet, colname):
	# collapse same-name columns:
	# 找出 "大致" 同名的那些欄位，加總合併
	# sheet 的內容會被直接修改掉！
	all_colname = list(filter(lambda x: x[:-1] == colname, sheet.columns))
	if len(all_colname) == 1:
	sheet.rename( columns={all_colname[0]:colname}, inplace=True )
	elif len(all_colname) > 1:
	# sheet.insert(len(sheet.columns)-1, colname, 0)
	# 本來要插在 '選舉人數' 前面。現在 '選舉人數' 已搬到最前面
	sheet[colname] = 0
	for cn in all_colname:
	sheet[colname] += sheet[cn]
	sheet.drop(labels=all_colname, axis='columns', inplace=True)

	def sheet2csv(sheet, savepath='', prepend={}):
	allcn = list(sheet.columns)
	effective = list(filter(lambda x: 'Unnamed' in x, allcn))[3]
	# 找出「有效票數」欄位
	sheet.rename( columns={
	allcn[0]: '鄉鎮區',
	allcn[1]: '村里',
	effective: '有效票數',
	allcn[-2]: '選舉人數',
	}, inplace=True )
	allcn = list(sheet.columns)
	cut = sheet[['選舉人數','有效票數']]
	sheet.drop(['有效票數', '選舉人數'], axis='columns', inplace=True)
	sheet.insert(2, '選舉人數', cut['選舉人數'])
	sheet.insert(3, '有效票數', cut['有效票數'])
	# 把 '選舉人數' 欄移到前面。
	# 注意：用 sheet = sheet[新順序] 是無效的，
	# 因為 sheet 是 local variable
	rename_dict = {}
	ptid = { pt:ord('A') for pt in PARTY_LIST }
	# 一黨有多位候選人時，按 A,B,C, ... 設流水編號
	for cn in allcn:
	m = re.search(r'\(\d+\)\n.*?\n(\S+)', cn)
	if m:
	pt = m.group(1)
	if not pt in PARTY_LIST:
	warn('小黨 "{}" 以無黨計算'.format(pt))
	pt = '無'
	rename_dict[cn] = pt + chr(ptid[pt])
	ptid[pt] += 1
	sheet.rename( columns=rename_dict, inplace=True )
	sheet['鄉鎮區'].replace(r'　(.*)', r'\1', inplace=True, regex=True)
	sheet['鄉鎮區'].fillna(method='ffill', inplace=True)
	#keep_only = list(filter(lambda x: re.search(r'%', x), sheet.columns))
	#sheet = sheet[ keep_only ]
	to_drop = list(filter(lambda x: re.search(r'Unnamed', x), sheet.columns))
	sheet.drop(to_drop, axis='columns', inplace=True)
	#rename_dict = { cn: cn[1:] for cn in sheet.columns }
	#sheet.rename( columns=rename_dict, inplace=True )
	all_candidates = list( filter(
	lambda x: x[:-1] in PARTY_LIST,
	sheet.columns
	) )
	for col in all_candidates + ['選舉人數', '有效票數']:
	sheet[col] = sheet[col].astype(str).str.replace(",","")
	sheet[col] = sheet[col].replace(r'[^\d.]', '0', regex=True).astype(float).astype(int)
	sheet = sheet.dropna(subset=['鄉鎮區'])
	sheet['村里'].fillna('小計', inplace=True)
	for pt in PARTY_LIST:
	collapse_sncols(sheet, pt)

	sheet['驗算'] = 0
	for pt in PARTY_LIST:
	if pt in sheet.columns:
	sheet['驗算'] += sheet[pt]
	verify = sheet['驗算'] - sheet['有效票數']
	assert(sum(verify * verify)==0)
	sheet.drop(labels='驗算', axis='columns', inplace=True)

	allpk = list(prepend.keys())
	allpk.reverse()
	for pk in allpk:
	sheet.insert(0, pk, prepend[pk])
	#print(sheet)
	sheet.to_csv(savepath if savepath else sys.stdout, index=None, encoding='utf-8')


	###########################################################
	# main
	###########################################################

	parser = argparse.ArgumentParser(
	description='2020立委得票統計轉檔',
	formatter_class=argparse.ArgumentDefaultsHelpFormatter)
	parser.add_argument('-s', '--savepath', type=str, default='/tmp',
	help='儲存csv檔的目錄')
	parser.add_argument('xlsfiles', nargs='*', help='xls檔 xls ...')
	args = parser.parse_args()

	#all_xls_fn = os.listdir()
	#all_xls_fn = filter(lambda x: re.search('一覽表\(.*(縣\|市)\)\.xls$',x), all_xls_fn)
	#all_xls_fn = ['平地立委-A05-4-得票數一覽表(彰化縣).xls']
	#all_xls_fn = ['區域立委-A05-2-得票數一覽表(花蓮縣).xls']
	#all_xls_fn = ['區域立委-A05-2-得票數一覽表(臺中市).xls']
	all_xls_fn = args.xlsfiles
	for fn in all_xls_fn:
	m = re.search(r'(\S+?)-.?\((.?)\)', fn)
	elec_type, city = (m.group(1), m.group(2))
	all_sheets = pd.read_excel(fn, None, encoding='utf-8', skiprows=2, thousandsstr=',')
	all_sn = all_sheets.keys() # sheet names
	#all_sn =['臺中市第4選舉區']
	for sn in all_sn:
	print('{} ： {}'.format(fn, sn))
	sheet2csv(all_sheets[sn], prepend={'選票類型':elec_type, '縣市':city}, savepath=args.savepath+'/{}_{}.csv'.format(elec_type, sn))