Skip to content

Instantly share code, notes, and snippets.

@yaojenkuo
Last active December 13, 2020 08:05
Show Gist options
  • Save yaojenkuo/68164c0ae503c87923c2727ff018646b to your computer and use it in GitHub Desktop.
Save yaojenkuo/68164c0ae503c87923c2727ff018646b to your computer and use it in GitHub Desktop.
import pandas as pd
from urllib.parse import quote_plus
from string import ascii_uppercase
import re
class TaiwanElection2020:
def __init__(self):
self._counties = ['宜蘭縣', '彰化縣', '金門縣', '桃園市', '苗栗縣', '臺南市', '雲林縣', '南投縣', '高雄市', '臺北市', '新北市', '花蓮縣', '新竹市', '新竹縣', '基隆市', '連江縣', '嘉義縣', '嘉義市', '屏東縣', '澎湖縣', '臺東縣', '臺中市']
def tidy_dataframe(self, df):
# updating columns attributes
n_cols = df.columns.size
n_candidates = n_cols - 11
id_vars = ['town', 'village', 'office']
candidates = list(df.columns[3:(3 + n_candidates)])
office_cols = list(ascii_uppercase[:8])
col_names = id_vars + candidates + office_cols
df.columns = col_names
# forward-fill town values
filled_towns = df['town'].fillna(method='ffill')
df = df.assign(town=filled_towns)
# removing summations
df = df.dropna()
# removing extra spaces
stripped_towns = df['town'].str.replace("\u3000", "")
df = df.assign(town=stripped_towns)
# pivoting
df = df.drop(labels=office_cols, axis=1)
tidy_df = pd.melt(df,
id_vars=id_vars,
var_name='candidate_info',
value_name='votes'
)
return tidy_df
def adjust_presidential(self, df):
# split candidate information into 2 columns
candidate_info_df = df['candidate_info'].str.split("\n", expand=True)
numbers = candidate_info_df[0].str.replace("\(|\)", "")
candidates = candidate_info_df[1].str.cat(candidate_info_df[2], sep="/")
# re-arrange columns
df = df.drop(labels='candidate_info', axis=1)
df['number'] = numbers
df['candidate'] = candidates
df['office'] = df['office'].astype(int)
df = df[['county', 'town', 'village', 'office', 'number', 'candidate', 'votes']]
return df
def generate_presidential(self):
presidential = pd.DataFrame()
for county in self._counties:
file_name = "總統-A05-4-候選人得票數一覽表-各投開票所({}).xls".format(county)
file_url = quote_plus(file_name)
spreadsheet_url = "https://taiwan-election-data.s3-ap-northeast-1.amazonaws.com/presidential_2020/{}".format(file_url)
# skip those combined cells
df = pd.read_excel(spreadsheet_url, skiprows=[0, 1, 3, 4], thousands=',')
tidy_df = self.tidy_dataframe(df)
# appending dataframe of each city/county
tidy_df['county'] = county
presidential = presidential.append(tidy_df)
print("Tidying {}".format(file_name))
presidential = presidential.reset_index(drop=True) # reset index for the appended dataframe
presidential_adjusted = self.adjust_presidential(presidential)
return presidential_adjusted
def adjust_legislative(self, df):
# split candidate information into 2 columns
candidate_info_df = df['candidate_info'].str.split("\n", expand=True)
numbers = candidate_info_df[0].str.replace("\(|\)", "")
candidates = candidate_info_df[1]
parties = candidate_info_df[2]
# re-arrange columns
df = df.drop(labels='candidate_info', axis=1)
df['number'] = numbers
df['candidate'] = candidates
df['party'] = parties
df['office'] = df['office'].astype(int)
return df
def generate_regional(self):
regional = pd.DataFrame()
for county in self._counties:
file_name = "區域立委-A05-2-得票數一覽表({}).xls".format(county)
file_url = quote_plus(file_name)
spreadsheet_url = "https://taiwan-election-data.s3-ap-northeast-1.amazonaws.com/legislative_2020/{}".format(file_url)
xl = pd.ExcelFile(spreadsheet_url)
for sheet in xl.sheet_names:
# skip those combined cells
df = pd.read_excel(spreadsheet_url, skiprows=[0, 1, 3, 4], thousands=',', sheet_name=sheet)
tidy_df = self.tidy_dataframe(df)
# appending dataframe of each city/county
tidy_df['county'] = county
tidy_df['electoral_district'] = sheet
regional = regional.append(tidy_df)
print("Tidying {} of {}".format(sheet, file_name))
regional = regional.reset_index(drop=True) # reset index for the appended dataframe
regional_adjusted = self.adjust_legislative(regional)
regional_adjusted = regional_adjusted[['county', 'electoral_district', 'town', 'village', 'office', 'number', 'party', 'candidate', 'votes']]
return regional_adjusted
def generate_indigenous(self):
indigenous = pd.DataFrame()
indigenous_types = ['山地', '平地']
for county in self._counties:
for indigenous_type in indigenous_types:
file_name = "{}立委-A05-4-得票數一覽表({}).xls".format(indigenous_type, county)
file_url = quote_plus(file_name)
spreadsheet_url = "https://taiwan-election-data.s3-ap-northeast-1.amazonaws.com/legislative_2020/{}".format(file_url)
# skip those combined cells
df = pd.read_excel(spreadsheet_url, skiprows=[0, 1, 3, 4], thousands=',')
tidy_df = self.tidy_dataframe(df)
# appending dataframe of each city/county
tidy_df['county'] = county
tidy_df['electoral_district'] = '{}原住民'.format(indigenous_type)
indigenous = indigenous.append(tidy_df)
print("Tidying {}".format(file_name))
indigenous = indigenous.reset_index(drop=True) # reset index for the appended dataframe
indigenous_adjusted = self.adjust_legislative(indigenous)
indigenous_adjusted = indigenous_adjusted[['county', 'electoral_district', 'town', 'village', 'office', 'number', 'party', 'candidate', 'votes']]
return indigenous_adjusted
def generate_legislative_regional(self):
regional = self.generate_regional()
indigenous = self.generate_indigenous()
legislative_regional = pd.concat([regional, indigenous], axis=0)
legislative_regional = legislative_regional.reset_index(drop=True)
return legislative_regional
def generate_legislative_at_large(self):
legislative_at_large = pd.DataFrame()
for county in self._counties:
file_name = "不分區立委-A05-6-得票數一覽表({}).xls".format(county)
file_url = quote_plus(file_name)
spreadsheet_url = "https://taiwan-election-data.s3-ap-northeast-1.amazonaws.com/legislative_2020/{}".format(file_url)
# skip those combined cells
df = pd.read_excel(spreadsheet_url, skiprows=[0, 1, 3, 4], thousands=',')
tidy_df = self.tidy_dataframe(df)
# appending dataframe of each city/county
tidy_df['county'] = county
legislative_at_large = legislative_at_large.append(tidy_df)
print("Tidying {}".format(file_name))
legislative_at_large = legislative_at_large.reset_index(drop=True) # reset index for the appended dataframe
legislative_at_large_adjusted = self.adjust_legislative(legislative_at_large)
legislative_at_large_adjusted = legislative_at_large_adjusted[['county', 'town', 'village', 'office', 'number', 'party', 'votes']]
return legislative_at_large_adjusted
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment