Skip to content

Instantly share code, notes, and snippets.

@jkjung-avt
Last active July 18, 2022 09:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jkjung-avt/dd40526ba97665cfbf847858fdd52e12 to your computer and use it in GitHub Desktop.
Save jkjung-avt/dd40526ba97665cfbf847858fdd52e12 to your computer and use it in GitHub Desktop.
從公開資訊觀測站下載108年度台灣上市、上櫃公司 "非擔任主管職務之全時員工薪資資訊"
"""crawl_salaries.py
For crawling average salaries of all public listed companies on Taiwan
Stock Exchange. Data source: https://mops.twse.com.tw/mops/web/t100sb15
Source code reference:
https://blog.techbridge.cc/2019/07/26/how-to-use-taiwan-salary-data-to-do-python-data-analytics-and-data-visualization/
Usage:
$ python3 crawl_salaries.py
Outputs:
108_sii.csv
108_otc.csv
"""
from collections import OrderedDict
import requests
import pandas as pd
from bs4 import BeautifulSoup
class HTMLTableParser:
def get_html_tables_from_resp(self, html_text):
soup = BeautifulSoup(html_text, 'html.parser')
tables = soup.find_all('table')
return tables
def parse_html_table(self, table):
"""
<tr>
<th align="center" class="tblHead" nowrap="" rowspan="2">產業類別</th>
<th align="center" class="tblHead" nowrap="" rowspan="2">公司代號</th>
<th align="center" class="tblHead" nowrap="" rowspan="2">公司名稱</th>
<th align="center" class="tblHead" colspan="4" nowrap="">非擔任主管職務之<br/>全時員工資訊</th>
<th align="center" class="tblHead" colspan="2" nowrap="">同業公司資訊</th>
<th align="center" class="tblHead" colspan="4" nowrap="">薪資統計情形</th>
</tr>
<tr>
<th align="center" class="tblHead" nowrap="">員工薪資總額(仟元)</th>
<th align="center" class="tblHead" nowrap="">員工人數-加權平均(人)</th>
<th align="center" class="tblHead" nowrap="">員工薪資-平均數(仟元/人)</th>
<th align="center" class="tblHead" nowrap="">每股盈餘(元/股)</th>
<th align="center" class="tblHead" nowrap="">員工薪資-平均數(仟元/人)</th>
<th align="center" class="tblHead" nowrap="">平均每股盈餘(元/股)</th>
<th align="center" class="tblHead" nowrap="">非經理人之<br/>全時員工薪資<br/>平均數未達50萬元</th>
<th align="center" class="tblHead" nowrap="">公司EPS獲利表現較同業為佳<br/>,惟非經理人之全時員工<br/>薪資平均數低於同業水準</th>
<th align="center" class="tblHead" nowrap="">公司EPS較前一年度成長<br/>,惟非經理人之全時員工<br/>薪資平均數較前一年度減少</th>
<th align="center" class="tblHead" nowrap="">公司經營績效與員工薪酬<br/>之關聯性及合理性說明</th>
</tr>
<tr>
<td nowrap="" style="text-align:left !important;">資訊服務業</td>
<td nowrap="" style="text-align:left !important;">8416</td>
<td nowrap="" style="text-align:left !important;">實威</td>
<td nowrap="" style="text-align:right !important;"> 158,636 </td>
<td nowrap="" style="text-align:right !important;"> 186 </td>
<td nowrap="" style="text-align:right !important;"> 853 </td>
<td nowrap="" style="text-align:right !important;"> 9.69 </td>
<td nowrap="" style="text-align:right !important;"> 807 </td>
<td nowrap="" style="text-align:right !important;"> 1.20 </td>
<td nowrap="" style="text-align:right !important;"></td>
<td nowrap="" style="text-align:right !important;"></td>
<td nowrap="" style="text-align:right !important;"></td>
<td nowrap="" style="text-align:left !important;"><br/></td>
</tr>
"""
parsed_data = []
# Find number of rows and columns
# we also find the column titles if we can
table_row_tags = table.find_all('tr')
table_header_tags = table.find_all('th')
column_names = [table_header_tag.get_text() for key, table_header_tag in enumerate(table_header_tags) if key not in (3, 4, 5)]
column_names[7] = '同業公司{}'.format(column_names[7])
column_names[8] = '同業公司{}'.format(column_names[8])
tr_td_tags = [
[td_tag.get_text().strip() for td_tag in table_row.find_all('td')]
for table_row in table_row_tags if table_row.find_all('td')
]
parsed_data = [
OrderedDict({
column_names[index]: td_tag
for index, td_tag in enumerate(tr_td_tag)
})
for tr_td_tag in tr_td_tags
]
df = pd.DataFrame.from_dict(parsed_data)
return df
def get_108_data(typek):
"""Get average salaries data for specified market in year 108 (2019).
# Arguments
typek: 'sii' (上市) or 'otc' (上櫃)
# Returns
A pandas dataframe containing the requested data
"""
htlm_parser = HTMLTableParser()
payload = {
# 'encodeURIComponent': 1,
'step': 1,
'firstin': 1,
'TYPEK': typek,
'RYEAR': 108,
'code': '',
}
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
resp = requests.post('https://mops.twse.com.tw/mops/web/ajax_t100sb15',
data=payload, headers=headers, timeout=2)
html_tables = htlm_parser.get_html_tables_from_resp(resp.text)
df_table = htlm_parser.parse_html_table(html_tables[0])
# 產業類別,公司代號,公司名稱,員工薪資總額(仟元),員工人數-年度平均(人),
# 員工薪資-平均數(仟元/人)-108年,員工薪資-平均數(仟元/人)-107年,
# 員工薪資-中位數(仟元/人)-108年,員工薪資-中位數(仟元/人)-107年,
# 平均每股盈餘(元/股),
# 同業公司員工薪資-平均數(仟元/人),同業公司每股盈餘(元/股),
# 非經理人之全時員工薪資平均數未達50萬元,
# 公司EPS獲利表現較同業為佳,惟非經理人之全時員工薪資平均數低於同業水準,
# 公司EPS較前一年度成長,惟非經理人之全時員工薪資平均數較前一年度減少,
# 公司經營績效與員工薪酬之關聯性及合理性說明
df_table.columns = [
'產業類別', '公司代號', '公司名稱', '員工薪資總額', '員工人數-年度平均',
'員工薪資-平均數-108年', '員工薪資-平均數-107年',
'員工薪資-中位數-108年', '員工薪資-中位數-107年',
'平均每股盈餘',
'同業公司員工薪資-平均數', '同業公司每股盈餘',
'is_under_50w', 'high_eps_low_salary',
'growth_but_low_salary', 'low_salary_reason']
# Convert salary numbers from strings into integers
df_table['員工人數-年度平均'] = df_table['員工人數-年度平均'].str.replace(',', '').replace('nan', '0').replace('', '0').astype(int)
df_table['員工薪資總額'] = df_table['員工薪資總額'].str.replace(',', '').replace('nan', '0').replace('', '0').astype(int) * 1000
df_table['員工薪資-平均數-108年'] = df_table['員工薪資-平均數-108年'].str.replace(',', '').replace('nan', '0').replace('', '0').astype(int) * 1000
df_table['員工薪資-平均數-107年'] = df_table['員工薪資-平均數-107年'].str.replace(',', '').replace('nan', '0').replace('', '0').astype(int) * 1000
df_table['員工薪資-中位數-108年'] = df_table['員工薪資-中位數-108年'].str.replace(',', '').replace('nan', '0').replace('', '0').astype(int) * 1000
df_table['員工薪資-中位數-107年'] = df_table['員工薪資-中位數-107年'].str.replace(',', '').replace('nan', '0').replace('', '0').astype(int) * 1000
df_table['同業公司員工薪資-平均數'] = df_table['同業公司員工薪資-平均數'].str.replace(',', '').replace('nan', '0').replace('', '0').astype(int) * 1000
df_table['平均每股盈餘'] = df_table['平均每股盈餘'].str.replace('nan', '0').replace('', '0').astype(float)
df_table['同業公司每股盈餘'] = df_table['同業公司每股盈餘'].str.replace('nan', '0').replace('', '0').astype(float)
return df_table
def main():
df_sii = get_108_data('sii')
df_sii.to_csv('108_sii.csv', index=False, encoding='utf-8')
df_sii = get_108_data('otc')
df_sii.to_csv('108_otc.csv', index=False, encoding='utf-8')
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment