# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import json
import mojimoji
import re
import datetime


def makejson(page_num=1):
    # Output JSON format:
    '''
    {
        "date": "7/13 15:00",
        "data": [
            {
                "name": "company name",
                "hp_url": "company website URL",
                "yf_url": "company's Yahoo! Finance URL",
                "feature": "business description (特色)"
            },
            {}, ..., {}
        ]
    }
    '''
    ret = {}
    ret["date"] = datetime.datetime.now().strftime("%m/%d %H:%M")
    ret["data"] = []

    # Iterate over the requested number of pages
    for page in range(1, page_num + 1):
        # Fetch one page of the Yahoo! Finance listing-date ranking
        urlstr = 'https://finance.yahoo.co.jp/stocks/ranking/listingDate?market=all&term=daily&page=' + str(page)
        html_text = requests.get(urlstr).text
        soup = BeautifulSoup(html_text, 'html.parser')

        # Find the ranking table (the one containing the "順位" (rank) header)
        table = None
        for selecttable in soup.find_all("table"):
            if "順位" in selecttable.text:
                table = selecttable
                break
        if table is None:
            continue

        # Walk the anchors in the ranking table
        for ancs in table.find_all("a"):
            # Skip links to the message board ("掲示板")
            if "掲示板" not in ancs.text:
                #------------------------------
                # Basic company info
                #------------------------------
                company = {}
                company["name"] = mojimoji.zen_to_han(ancs.text, kana=False).replace("&amp;", "&")
                company["yf_url"] = ancs.attrs["href"]

                #------------------------------
                # Business description (特色)
                #------------------------------
                company_code = re.sub(".*/", "", company["yf_url"])
                urlstr2 = "https://finance.yahoo.co.jp/quote/" + company_code + "/profile"
                html_text2 = requests.get(urlstr2).text
                soup2 = BeautifulSoup(html_text2, 'html.parser')

                # Pull the row labeled "特色" (feature) or "概要" (overview) from the profile table
                company["feature"] = ""
                for selecttable in soup2.find_all("table"):
                    if "特色" in selecttable.text or "概要" in selecttable.text:
                        for selecttr in selecttable.find_all("tr"):
                            if "特色" in selecttr.text or "概要" in selecttr.text:
                                company["feature"] = selecttr.find_all("td")[0].text.replace("【特色】", "")
                                break
                        break

                # Skip companies whose description has not been published yet
                if company["feature"] == "---":
                    continue

                #------------------------------
                # Company website URL (via Google search)
                #------------------------------
                urlstr3 = "https://www.google.com/search?q=" + company["name"] + "+hp"
                html_text3 = requests.get(urlstr3).text
                soup3 = BeautifulSoup(html_text3, 'html.parser')

                # Take the first result that is neither Google itself nor Wikipedia
                company["hp_url"] = ""
                for selectdiv in soup3.select("div > a"):
                    href = selectdiv.attrs.get("href", "")
                    if href.startswith("/url?q=") and "google" not in href and "wikipedia" not in href:
                        tmpstr = href.replace("/url?q=", "")
                        tmpstr = re.sub("&sa=U.*", "", tmpstr)
                        company["hp_url"] = tmpstr
                        break

                #------------------------------
                # Progress output
                #------------------------------
                print(company["name"])
                print(company["hp_url"])
                print(company["yf_url"])
                print(company["feature"])
                print("----------------------")

                ret["data"].append(company)

    # Write the JSON file
    with open("dynamic/youngstock.json", "w", encoding='utf-8') as f:
        f.write(json.dumps(ret, ensure_ascii=False))


if __name__ == "__main__":
    # Create the JSON file
    makejson(page_num=15)
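
# ----------------------------------------------------------------------
# Hypothetical usage sketch (not part of the original script): how the
# JSON written to dynamic/youngstock.json could be consumed, given the
# format documented in the docstring above. load_youngstock() is an
# illustrative name, not an existing helper; it assumes makejson() has
# already been run.
# ----------------------------------------------------------------------
# def load_youngstock(path="dynamic/youngstock.json"):
#     """Load the ranking JSON and return (date string, list of company dicts)."""
#     with open(path, encoding="utf-8") as f:
#         ranking = json.load(f)
#     return ranking["date"], ranking["data"]
#
# date, companies = load_youngstock()
# for c in companies:
#     print(c["name"], c["hp_url"], c["feature"])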