jkjung-avt/crawl_salaries.py

## crawl_salaries.py
"""crawl_salaries.py

For crawling average salaries of all public listed companies on Taiwan
Stock Exchange.  Data source: https://mops.twse.com.tw/mops/web/t100sb15

Source code reference:
https://blog.techbridge.cc/2019/07/26/how-to-use-taiwan-salary-data-to-do-python-data-analytics-and-data-visualization/

Usage:
$ python3 crawl_salaries.py

Outputs:
  108_sii.csv
  108_otc.csv
"""


from collections import OrderedDict

import requests
import pandas as pd
from bs4 import BeautifulSoup


class HTMLTableParser:

    def get_html_tables_from_resp(self, html_text):
        soup = BeautifulSoup(html_text, 'html.parser')
        tables = soup.find_all('table')
        return tables

    def parse_html_table(self, table):
        """
        <tr>
            <th align="center" class="tblHead" nowrap="" rowspan="2">產業類別</th>
            <th align="center" class="tblHead" nowrap="" rowspan="2">公司代號</th>
            <th align="center" class="tblHead" nowrap="" rowspan="2">公司名稱</th>
            <th align="center" class="tblHead" colspan="4" nowrap="">非擔任主管職務之<br/>全時員工資訊</th>
            <th align="center" class="tblHead" colspan="2" nowrap="">同業公司資訊</th>
            <th align="center" class="tblHead" colspan="4" nowrap="">薪資統計情形</th>
        </tr>
        <tr>
            <th align="center" class="tblHead" nowrap="">員工薪資總額(仟元)</th>
            <th align="center" class="tblHead" nowrap="">員工人數-加權平均(人)</th>
            <th align="center" class="tblHead" nowrap="">員工薪資-平均數(仟元/人)</th>
            <th align="center" class="tblHead" nowrap="">每股盈餘(元/股)</th>
            <th align="center" class="tblHead" nowrap="">員工薪資-平均數(仟元/人)</th>
            <th align="center" class="tblHead" nowrap="">平均每股盈餘(元/股)</th>
            <th align="center" class="tblHead" nowrap="">非經理人之<br/>全時員工薪資<br/>平均數未達50萬元</th>
            <th align="center" class="tblHead" nowrap="">公司EPS獲利表現較同業為佳<br/>，惟非經理人之全時員工<br/>薪資平均數低於同業水準</th>
            <th align="center" class="tblHead" nowrap="">公司EPS較前一年度成長<br/>，惟非經理人之全時員工<br/>薪資平均數較前一年度減少</th>
            <th align="center" class="tblHead" nowrap="">公司經營績效與員工薪酬<br/>之關聯性及合理性說明</th>
        </tr>
        <tr>
            <td nowrap="" style="text-align:left !important;">資訊服務業</td>
            <td nowrap="" style="text-align:left !important;">8416</td>
            <td nowrap="" style="text-align:left !important;">實威</td>
            <td nowrap="" style="text-align:right !important;"> 158,636 </td>
            <td nowrap="" style="text-align:right !important;"> 186 </td>
            <td nowrap="" style="text-align:right !important;"> 853 </td>
            <td nowrap="" style="text-align:right !important;"> 9.69 </td>
            <td nowrap="" style="text-align:right !important;"> 807 </td>
            <td nowrap="" style="text-align:right !important;"> 1.20 </td>
            <td nowrap="" style="text-align:right !important;"></td>
            <td nowrap="" style="text-align:right !important;"></td>
            <td nowrap="" style="text-align:right !important;"></td>
            <td nowrap="" style="text-align:left !important;"><br/></td>
        </tr>
        """
        parsed_data = []

        # Find number of rows and columns
        # we also find the column titles if we can
        table_row_tags = table.find_all('tr')
        table_header_tags = table.find_all('th')
        column_names = [table_header_tag.get_text() for key, table_header_tag in enumerate(table_header_tags) if key not in (3, 4, 5)]
        column_names[7] = '同業公司{}'.format(column_names[7])
        column_names[8] = '同業公司{}'.format(column_names[8])

        tr_td_tags = [
            [td_tag.get_text().strip() for td_tag in table_row.find_all('td')]
            for table_row in table_row_tags if table_row.find_all('td')
        ]

        parsed_data = [
            OrderedDict({
                column_names[index]: td_tag
                for index, td_tag in enumerate(tr_td_tag)
            })
            for tr_td_tag in tr_td_tags
        ]

        df = pd.DataFrame.from_dict(parsed_data)

        return df


def get_108_data(typek):
    """Get average salaries data for specified market in year 108 (2019).

    # Arguments
        typek: 'sii' (上市) or 'otc' (上櫃)

    # Returns
        A pandas dataframe containing the requested data
    """
    htlm_parser = HTMLTableParser()
    payload = {
        # 'encodeURIComponent': 1,
        'step': 1,
        'firstin': 1,
        'TYPEK': typek,
        'RYEAR': 108,
        'code': '',
    }
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}

    resp = requests.post('https://mops.twse.com.tw/mops/web/ajax_t100sb15',
                         data=payload, headers=headers, timeout=2)

    html_tables = htlm_parser.get_html_tables_from_resp(resp.text)
    df_table = htlm_parser.parse_html_table(html_tables[0])

    # 產業類別,公司代號,公司名稱,員工薪資總額(仟元),員工人數-年度平均(人),
    # 員工薪資-平均數(仟元/人)-108年,員工薪資-平均數(仟元/人)-107年,
    # 員工薪資-中位數(仟元/人)-108年,員工薪資-中位數(仟元/人)-107年,
    # 平均每股盈餘(元/股),
    # 同業公司員工薪資-平均數(仟元/人),同業公司每股盈餘(元/股),
    # 非經理人之全時員工薪資平均數未達50萬元,
    # 公司EPS獲利表現較同業為佳，惟非經理人之全時員工薪資平均數低於同業水準,
    # 公司EPS較前一年度成長，惟非經理人之全時員工薪資平均數較前一年度減少,
    # 公司經營績效與員工薪酬之關聯性及合理性說明
    df_table.columns = [
        '產業類別', '公司代號', '公司名稱', '員工薪資總額', '員工人數-年度平均',
        '員工薪資-平均數-108年', '員工薪資-平均數-107年',
        '員工薪資-中位數-108年', '員工薪資-中位數-107年',
        '平均每股盈餘',
        '同業公司員工薪資-平均數', '同業公司每股盈餘',
        'is_under_50w', 'high_eps_low_salary',
        'growth_but_low_salary', 'low_salary_reason']

    # Convert salary numbers from strings into integers
    df_table['員工人數-年度平均'] = df_table['員工人數-年度平均'].str.replace(',', '').replace('nan', '0').replace('', '0').astype(int)
    df_table['員工薪資總額'] = df_table['員工薪資總額'].str.replace(',', '').replace('nan', '0').replace('', '0').astype(int) * 1000
    df_table['員工薪資-平均數-108年'] = df_table['員工薪資-平均數-108年'].str.replace(',', '').replace('nan', '0').replace('', '0').astype(int) * 1000
    df_table['員工薪資-平均數-107年'] = df_table['員工薪資-平均數-107年'].str.replace(',', '').replace('nan', '0').replace('', '0').astype(int) * 1000
    df_table['員工薪資-中位數-108年'] = df_table['員工薪資-中位數-108年'].str.replace(',', '').replace('nan', '0').replace('', '0').astype(int) * 1000
    df_table['員工薪資-中位數-107年'] = df_table['員工薪資-中位數-107年'].str.replace(',', '').replace('nan', '0').replace('', '0').astype(int) * 1000
    df_table['同業公司員工薪資-平均數'] = df_table['同業公司員工薪資-平均數'].str.replace(',', '').replace('nan', '0').replace('', '0').astype(int) * 1000
    df_table['平均每股盈餘'] = df_table['平均每股盈餘'].str.replace('nan', '0').replace('', '0').astype(float)
    df_table['同業公司每股盈餘'] = df_table['同業公司每股盈餘'].str.replace('nan', '0').replace('', '0').astype(float)

    return df_table


def main():
    df_sii = get_108_data('sii')
    df_sii.to_csv('108_sii.csv', index=False, encoding='utf-8')

    df_sii = get_108_data('otc')
    df_sii.to_csv('108_otc.csv', index=False, encoding='utf-8')


if __name__ == '__main__':
    main()

## google_image_links.py
"""google_image_links.py

For fetching image links google by querying a keyword.

# Prerequisite:
  1. Download "chromedriver" (same version as your Chrome browser).
     link: https://chromedriver.chromium.org/downloads
  2. Unzip the downloaded file and put "chromedriver" binary into ${HOME}/bin.
  2. `pip3 install selenium`
"""


import os
import selenium


DRIVER_PATH = os.environ['HOME'] + '/bin/chromedriver'
WEB_DRIVER = webdriver.Chrome(executable_path=DRIVER_PATH)


def google_image_links(keyword, max_links_to_fetch):
    def scroll_to_end(wd):
        wd.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        time.sleep(1)

    # build the google query
    search_url = ('https://www.google.com/search?safe=off&site=&tbm=isch&'
                  'source=hp&q={q}&oq={q}&gs_l=img')

    # load the page
    WEB_DRIVER.get(search_url.format(q=keyword))

    image_urls = set()
    image_count = 0
    results_start = 0
    while image_count < max_links_to_fetch:
        scroll_to_end(WEB_DRIVER)

        # get all image thumbnail results
        thumbnail_results = WEB_DRIVER.find_elements_by_css_selector('img.Q4LuWd')
        number_results = len(thumbnail_results)

        for img in thumbnail_results[results_start:number_results]:
            # try to click every thumbnail such that we can get the real
            # image behind it
            try:
                img.click()
                time.sleep(1)
            except Exception:
                continue

            # extract image urls
            actual_images = WEB_DRIVER.find_elements_by_css_selector('img.n3VNCb')
            for actual_image in actual_images:
                if (actual_image.get_attribute('src') and
                    'http' in actual_image.get_attribute('src')):
                    image_urls.add(actual_image.get_attribute('src'))

            image_count = len(image_urls)

            if image_count >= max_links_to_fetch:
                break
        else:
            time.sleep(30)
            load_more_button = WEB_DRIVER.find_element_by_css_selector('.mye4qd')
            if load_more_button:
                WEB_DRIVER.execute_script('document.querySelector(".mye4qd").click();')

        # move the result startpoint further down
        results_start = len(thumbnail_results)

    return list(image_urls)
	"""crawl_salaries.py

	For crawling average salaries of all public listed companies on Taiwan
	Stock Exchange. Data source: https://mops.twse.com.tw/mops/web/t100sb15

	Source code reference:
	https://blog.techbridge.cc/2019/07/26/how-to-use-taiwan-salary-data-to-do-python-data-analytics-and-data-visualization/

	Usage:
	$ python3 crawl_salaries.py

	Outputs:
	108_sii.csv
	108_otc.csv
	"""


	from collections import OrderedDict

	import requests
	import pandas as pd
	from bs4 import BeautifulSoup


	class HTMLTableParser:

	def get_html_tables_from_resp(self, html_text):
	soup = BeautifulSoup(html_text, 'html.parser')
	tables = soup.find_all('table')
	return tables

	def parse_html_table(self, table):
	"""
	<tr>
	<th align="center" class="tblHead" nowrap="" rowspan="2">產業類別</th>
	<th align="center" class="tblHead" nowrap="" rowspan="2">公司代號</th>
	<th align="center" class="tblHead" nowrap="" rowspan="2">公司名稱</th>
	<th align="center" class="tblHead" colspan="4" nowrap="">非擔任主管職務之<br/>全時員工資訊</th>
	<th align="center" class="tblHead" colspan="2" nowrap="">同業公司資訊</th>
	<th align="center" class="tblHead" colspan="4" nowrap="">薪資統計情形</th>
	</tr>
	<tr>
	<th align="center" class="tblHead" nowrap="">員工薪資總額(仟元)</th>
	<th align="center" class="tblHead" nowrap="">員工人數-加權平均(人)</th>
	<th align="center" class="tblHead" nowrap="">員工薪資-平均數(仟元/人)</th>
	<th align="center" class="tblHead" nowrap="">每股盈餘(元/股)</th>
	<th align="center" class="tblHead" nowrap="">員工薪資-平均數(仟元/人)</th>
	<th align="center" class="tblHead" nowrap="">平均每股盈餘(元/股)</th>
	<th align="center" class="tblHead" nowrap="">非經理人之<br/>全時員工薪資<br/>平均數未達50萬元</th>
	<th align="center" class="tblHead" nowrap="">公司EPS獲利表現較同業為佳<br/>，惟非經理人之全時員工<br/>薪資平均數低於同業水準</th>
	<th align="center" class="tblHead" nowrap="">公司EPS較前一年度成長<br/>，惟非經理人之全時員工<br/>薪資平均數較前一年度減少</th>
	<th align="center" class="tblHead" nowrap="">公司經營績效與員工薪酬<br/>之關聯性及合理性說明</th>
	</tr>
	<tr>
	<td nowrap="" style="text-align:left !important;">資訊服務業</td>
	<td nowrap="" style="text-align:left !important;">8416</td>
	<td nowrap="" style="text-align:left !important;">實威</td>
	<td nowrap="" style="text-align:right !important;"> 158,636 </td>
	<td nowrap="" style="text-align:right !important;"> 186 </td>
	<td nowrap="" style="text-align:right !important;"> 853 </td>
	<td nowrap="" style="text-align:right !important;"> 9.69 </td>
	<td nowrap="" style="text-align:right !important;"> 807 </td>
	<td nowrap="" style="text-align:right !important;"> 1.20 </td>
	<td nowrap="" style="text-align:right !important;"></td>
	<td nowrap="" style="text-align:right !important;"></td>
	<td nowrap="" style="text-align:right !important;"></td>
	<td nowrap="" style="text-align:left !important;"><br/></td>
	</tr>
	"""
	parsed_data = []

	# Find number of rows and columns
	# we also find the column titles if we can
	table_row_tags = table.find_all('tr')
	table_header_tags = table.find_all('th')
	column_names = [table_header_tag.get_text() for key, table_header_tag in enumerate(table_header_tags) if key not in (3, 4, 5)]
	column_names[7] = '同業公司{}'.format(column_names[7])
	column_names[8] = '同業公司{}'.format(column_names[8])

	tr_td_tags = [
	[td_tag.get_text().strip() for td_tag in table_row.find_all('td')]
	for table_row in table_row_tags if table_row.find_all('td')
	]

	parsed_data = [
	OrderedDict({
	column_names[index]: td_tag
	for index, td_tag in enumerate(tr_td_tag)
	})
	for tr_td_tag in tr_td_tags
	]

	df = pd.DataFrame.from_dict(parsed_data)

	return df


	def get_108_data(typek):
	"""Get average salaries data for specified market in year 108 (2019).

	# Arguments
	typek: 'sii' (上市) or 'otc' (上櫃)

	# Returns
	A pandas dataframe containing the requested data
	"""
	htlm_parser = HTMLTableParser()
	payload = {
	# 'encodeURIComponent': 1,
	'step': 1,
	'firstin': 1,
	'TYPEK': typek,
	'RYEAR': 108,
	'code': '',
	}
	headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}

	resp = requests.post('https://mops.twse.com.tw/mops/web/ajax_t100sb15',
	data=payload, headers=headers, timeout=2)

	html_tables = htlm_parser.get_html_tables_from_resp(resp.text)
	df_table = htlm_parser.parse_html_table(html_tables[0])

	# 產業類別,公司代號,公司名稱,員工薪資總額(仟元),員工人數-年度平均(人),
	# 員工薪資-平均數(仟元/人)-108年,員工薪資-平均數(仟元/人)-107年,
	# 員工薪資-中位數(仟元/人)-108年,員工薪資-中位數(仟元/人)-107年,
	# 平均每股盈餘(元/股),
	# 同業公司員工薪資-平均數(仟元/人),同業公司每股盈餘(元/股),
	# 非經理人之全時員工薪資平均數未達50萬元,
	# 公司EPS獲利表現較同業為佳，惟非經理人之全時員工薪資平均數低於同業水準,
	# 公司EPS較前一年度成長，惟非經理人之全時員工薪資平均數較前一年度減少,
	# 公司經營績效與員工薪酬之關聯性及合理性說明
	df_table.columns = [
	'產業類別', '公司代號', '公司名稱', '員工薪資總額', '員工人數-年度平均',
	'員工薪資-平均數-108年', '員工薪資-平均數-107年',
	'員工薪資-中位數-108年', '員工薪資-中位數-107年',
	'平均每股盈餘',
	'同業公司員工薪資-平均數', '同業公司每股盈餘',
	'is_under_50w', 'high_eps_low_salary',
	'growth_but_low_salary', 'low_salary_reason']

	# Convert salary numbers from strings into integers
	df_table['員工人數-年度平均'] = df_table['員工人數-年度平均'].str.replace(',', '').replace('nan', '0').replace('', '0').astype(int)
	df_table['員工薪資總額'] = df_table['員工薪資總額'].str.replace(',', '').replace('nan', '0').replace('', '0').astype(int) * 1000
	df_table['員工薪資-平均數-108年'] = df_table['員工薪資-平均數-108年'].str.replace(',', '').replace('nan', '0').replace('', '0').astype(int) * 1000
	df_table['員工薪資-平均數-107年'] = df_table['員工薪資-平均數-107年'].str.replace(',', '').replace('nan', '0').replace('', '0').astype(int) * 1000
	df_table['員工薪資-中位數-108年'] = df_table['員工薪資-中位數-108年'].str.replace(',', '').replace('nan', '0').replace('', '0').astype(int) * 1000
	df_table['員工薪資-中位數-107年'] = df_table['員工薪資-中位數-107年'].str.replace(',', '').replace('nan', '0').replace('', '0').astype(int) * 1000
	df_table['同業公司員工薪資-平均數'] = df_table['同業公司員工薪資-平均數'].str.replace(',', '').replace('nan', '0').replace('', '0').astype(int) * 1000
	df_table['平均每股盈餘'] = df_table['平均每股盈餘'].str.replace('nan', '0').replace('', '0').astype(float)
	df_table['同業公司每股盈餘'] = df_table['同業公司每股盈餘'].str.replace('nan', '0').replace('', '0').astype(float)

	return df_table


	def main():
	df_sii = get_108_data('sii')
	df_sii.to_csv('108_sii.csv', index=False, encoding='utf-8')

	df_sii = get_108_data('otc')
	df_sii.to_csv('108_otc.csv', index=False, encoding='utf-8')


	if __name__ == '__main__':
	main()
	"""google_image_links.py

	For fetching image links google by querying a keyword.

	# Prerequisite:
	1. Download "chromedriver" (same version as your Chrome browser).
	link: https://chromedriver.chromium.org/downloads
	2. Unzip the downloaded file and put "chromedriver" binary into ${HOME}/bin.
	2. `pip3 install selenium`
	"""


	import os
	import selenium


	DRIVER_PATH = os.environ['HOME'] + '/bin/chromedriver'
	WEB_DRIVER = webdriver.Chrome(executable_path=DRIVER_PATH)


	def google_image_links(keyword, max_links_to_fetch):
	def scroll_to_end(wd):
	wd.execute_script('window.scrollTo(0, document.body.scrollHeight);')
	time.sleep(1)

	# build the google query
	search_url = ('https://www.google.com/search?safe=off&site=&tbm=isch&'
	'source=hp&q={q}&oq={q}&gs_l=img')

	# load the page
	WEB_DRIVER.get(search_url.format(q=keyword))

	image_urls = set()
	image_count = 0
	results_start = 0
	while image_count < max_links_to_fetch:
	scroll_to_end(WEB_DRIVER)

	# get all image thumbnail results
	thumbnail_results = WEB_DRIVER.find_elements_by_css_selector('img.Q4LuWd')
	number_results = len(thumbnail_results)

	for img in thumbnail_results[results_start:number_results]:
	# try to click every thumbnail such that we can get the real
	# image behind it
	try:
	img.click()
	time.sleep(1)
	except Exception:
	continue

	# extract image urls
	actual_images = WEB_DRIVER.find_elements_by_css_selector('img.n3VNCb')
	for actual_image in actual_images:
	if (actual_image.get_attribute('src') and
	'http' in actual_image.get_attribute('src')):
	image_urls.add(actual_image.get_attribute('src'))

	image_count = len(image_urls)

	if image_count >= max_links_to_fetch:
	break
	else:
	time.sleep(30)
	load_more_button = WEB_DRIVER.find_element_by_css_selector('.mye4qd')
	if load_more_button:
	WEB_DRIVER.execute_script('document.querySelector(".mye4qd").click();')

	# move the result startpoint further down
	results_start = len(thumbnail_results)

	return list(image_urls)