Last active
December 13, 2020 08:05
-
-
Save yaojenkuo/68164c0ae503c87923c2727ff018646b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
from urllib.parse import quote_plus | |
from string import ascii_uppercase | |
import re | |
class TaiwanElection2020: | |
def __init__(self): | |
self._counties = ['宜蘭縣', '彰化縣', '金門縣', '桃園市', '苗栗縣', '臺南市', '雲林縣', '南投縣', '高雄市', '臺北市', '新北市', '花蓮縣', '新竹市', '新竹縣', '基隆市', '連江縣', '嘉義縣', '嘉義市', '屏東縣', '澎湖縣', '臺東縣', '臺中市'] | |
def tidy_dataframe(self, df): | |
# updating columns attributes | |
n_cols = df.columns.size | |
n_candidates = n_cols - 11 | |
id_vars = ['town', 'village', 'office'] | |
candidates = list(df.columns[3:(3 + n_candidates)]) | |
office_cols = list(ascii_uppercase[:8]) | |
col_names = id_vars + candidates + office_cols | |
df.columns = col_names | |
# forward-fill town values | |
filled_towns = df['town'].fillna(method='ffill') | |
df = df.assign(town=filled_towns) | |
# removing summations | |
df = df.dropna() | |
# removing extra spaces | |
stripped_towns = df['town'].str.replace("\u3000", "") | |
df = df.assign(town=stripped_towns) | |
# pivoting | |
df = df.drop(labels=office_cols, axis=1) | |
tidy_df = pd.melt(df, | |
id_vars=id_vars, | |
var_name='candidate_info', | |
value_name='votes' | |
) | |
return tidy_df | |
def adjust_presidential(self, df): | |
# split candidate information into 2 columns | |
candidate_info_df = df['candidate_info'].str.split("\n", expand=True) | |
numbers = candidate_info_df[0].str.replace("\(|\)", "") | |
candidates = candidate_info_df[1].str.cat(candidate_info_df[2], sep="/") | |
# re-arrange columns | |
df = df.drop(labels='candidate_info', axis=1) | |
df['number'] = numbers | |
df['candidate'] = candidates | |
df['office'] = df['office'].astype(int) | |
df = df[['county', 'town', 'village', 'office', 'number', 'candidate', 'votes']] | |
return df | |
def generate_presidential(self): | |
presidential = pd.DataFrame() | |
for county in self._counties: | |
file_name = "總統-A05-4-候選人得票數一覽表-各投開票所({}).xls".format(county) | |
file_url = quote_plus(file_name) | |
spreadsheet_url = "https://taiwan-election-data.s3-ap-northeast-1.amazonaws.com/presidential_2020/{}".format(file_url) | |
# skip those combined cells | |
df = pd.read_excel(spreadsheet_url, skiprows=[0, 1, 3, 4], thousands=',') | |
tidy_df = self.tidy_dataframe(df) | |
# appending dataframe of each city/county | |
tidy_df['county'] = county | |
presidential = presidential.append(tidy_df) | |
print("Tidying {}".format(file_name)) | |
presidential = presidential.reset_index(drop=True) # reset index for the appended dataframe | |
presidential_adjusted = self.adjust_presidential(presidential) | |
return presidential_adjusted | |
def adjust_legislative(self, df): | |
# split candidate information into 2 columns | |
candidate_info_df = df['candidate_info'].str.split("\n", expand=True) | |
numbers = candidate_info_df[0].str.replace("\(|\)", "") | |
candidates = candidate_info_df[1] | |
parties = candidate_info_df[2] | |
# re-arrange columns | |
df = df.drop(labels='candidate_info', axis=1) | |
df['number'] = numbers | |
df['candidate'] = candidates | |
df['party'] = parties | |
df['office'] = df['office'].astype(int) | |
return df | |
def generate_regional(self): | |
regional = pd.DataFrame() | |
for county in self._counties: | |
file_name = "區域立委-A05-2-得票數一覽表({}).xls".format(county) | |
file_url = quote_plus(file_name) | |
spreadsheet_url = "https://taiwan-election-data.s3-ap-northeast-1.amazonaws.com/legislative_2020/{}".format(file_url) | |
xl = pd.ExcelFile(spreadsheet_url) | |
for sheet in xl.sheet_names: | |
# skip those combined cells | |
df = pd.read_excel(spreadsheet_url, skiprows=[0, 1, 3, 4], thousands=',', sheet_name=sheet) | |
tidy_df = self.tidy_dataframe(df) | |
# appending dataframe of each city/county | |
tidy_df['county'] = county | |
tidy_df['electoral_district'] = sheet | |
regional = regional.append(tidy_df) | |
print("Tidying {} of {}".format(sheet, file_name)) | |
regional = regional.reset_index(drop=True) # reset index for the appended dataframe | |
regional_adjusted = self.adjust_legislative(regional) | |
regional_adjusted = regional_adjusted[['county', 'electoral_district', 'town', 'village', 'office', 'number', 'party', 'candidate', 'votes']] | |
return regional_adjusted | |
def generate_indigenous(self): | |
indigenous = pd.DataFrame() | |
indigenous_types = ['山地', '平地'] | |
for county in self._counties: | |
for indigenous_type in indigenous_types: | |
file_name = "{}立委-A05-4-得票數一覽表({}).xls".format(indigenous_type, county) | |
file_url = quote_plus(file_name) | |
spreadsheet_url = "https://taiwan-election-data.s3-ap-northeast-1.amazonaws.com/legislative_2020/{}".format(file_url) | |
# skip those combined cells | |
df = pd.read_excel(spreadsheet_url, skiprows=[0, 1, 3, 4], thousands=',') | |
tidy_df = self.tidy_dataframe(df) | |
# appending dataframe of each city/county | |
tidy_df['county'] = county | |
tidy_df['electoral_district'] = '{}原住民'.format(indigenous_type) | |
indigenous = indigenous.append(tidy_df) | |
print("Tidying {}".format(file_name)) | |
indigenous = indigenous.reset_index(drop=True) # reset index for the appended dataframe | |
indigenous_adjusted = self.adjust_legislative(indigenous) | |
indigenous_adjusted = indigenous_adjusted[['county', 'electoral_district', 'town', 'village', 'office', 'number', 'party', 'candidate', 'votes']] | |
return indigenous_adjusted | |
def generate_legislative_regional(self): | |
regional = self.generate_regional() | |
indigenous = self.generate_indigenous() | |
legislative_regional = pd.concat([regional, indigenous], axis=0) | |
legislative_regional = legislative_regional.reset_index(drop=True) | |
return legislative_regional | |
def generate_legislative_at_large(self): | |
legislative_at_large = pd.DataFrame() | |
for county in self._counties: | |
file_name = "不分區立委-A05-6-得票數一覽表({}).xls".format(county) | |
file_url = quote_plus(file_name) | |
spreadsheet_url = "https://taiwan-election-data.s3-ap-northeast-1.amazonaws.com/legislative_2020/{}".format(file_url) | |
# skip those combined cells | |
df = pd.read_excel(spreadsheet_url, skiprows=[0, 1, 3, 4], thousands=',') | |
tidy_df = self.tidy_dataframe(df) | |
# appending dataframe of each city/county | |
tidy_df['county'] = county | |
legislative_at_large = legislative_at_large.append(tidy_df) | |
print("Tidying {}".format(file_name)) | |
legislative_at_large = legislative_at_large.reset_index(drop=True) # reset index for the appended dataframe | |
legislative_at_large_adjusted = self.adjust_legislative(legislative_at_large) | |
legislative_at_large_adjusted = legislative_at_large_adjusted[['county', 'town', 'village', 'office', 'number', 'party', 'votes']] | |
return legislative_at_large_adjusted |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment