Created
August 19, 2017 00:11
-
-
Save Azoay/084a2d5ea85241bed1f6856db6dc1d2a to your computer and use it in GitHub Desktop.
amazonランキング100個分を取得
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8
import csv
import json
import time

import requests
from bs4 import BeautifulSoup
# Module-level accumulators shared by scrape_title() (which appends to them)
# and main() (which reads them). Entry i of both lists describes the same item.
l_title = []  # item titles, in the order they were scraped
l_asin = []  # ASIN strings, index-aligned with l_title
def scrape_title(htmltext):
    """Extract the ASIN and title of every item on one ranking page.

    Each element with class ``zg_itemRow`` is treated as one item. The ASIN
    is read from the JSON stored in the ``data-p13n-asin-metadata`` attribute
    of the row's first ``<div>``; the title is the text of the ``<div>``
    inside the row's second ``<a>``. Results are appended, in document order,
    to the module-level lists ``l_asin`` and ``l_title``.

    Args:
        htmltext: HTML source of a ranking page.

    Raises:
        TypeError, KeyError, IndexError, AttributeError: if a row does not
            have the markup described above.
        ValueError: if the metadata attribute is not valid JSON.
    """
    soup = BeautifulSoup(htmltext, 'lxml')
    for row in soup.find_all(class_="zg_itemRow"):
        # asin
        metadata = json.loads(row.find("div")['data-p13n-asin-metadata'])
        asin = metadata['asin']

        # title
        # get_text() is used instead of .string because .string is None when
        # the <div> contains nested markup, which made .strip() fail.
        title = row.find_all("a")[1].find("div").get_text().strip()

        # Append only after both values were extracted so the two lists can
        # never end up with different lengths.
        l_asin.append(asin)
        l_title.append(title)
def main(url):
    """Scrape ranking pages 1-5 of *url*, print the items and save them as CSV.

    Each page is downloaded and passed to ``scrape_title()``, with a one-second
    pause after every request. Afterwards every collected item is printed as
    ``No.<rank> <title> <asin>`` and written to ``ret.csv`` in the current
    directory (UTF-8, ``\\n`` line endings, header ``no,title,asin``).

    Args:
        url: URL of the ranking listing to scrape.

    Raises:
        requests.RequestException: if a page cannot be downloaded within the
            timeout.
    """
    for page in range(1, 6):
        # The page number used to be appended as a "#N" fragment, but a
        # fragment is never sent to the server, so every request returned the
        # same page. Send it as a query parameter instead.
        # NOTE(review): assumes the listing is paginated by a `pg` query
        # parameter - confirm against the live site.
        response = requests.get(url, params={"pg": page}, timeout=30)
        scrape_title(response.text)
        time.sleep(1.0)  # pause between requests

    rows = [("no", "title", "asin")]
    for rank, (title, asin) in enumerate(zip(l_title, l_asin), start=1):
        print("No." + str(rank).rjust(2) + ' ' + title + ' ' + asin)
        rows.append((rank, title, asin))

    # csv.writer quotes titles that contain commas, quotes or line breaks;
    # plain string formatting produced broken rows for such titles.
    with open("ret.csv", "w", encoding="utf-8", newline="") as f:
        csv.writer(f, lineterminator="\n").writerows(rows)
# Script entry point: scrape the listing selected below.
if __name__ == "__main__":
    # Listing to scrape: books best sellers.
    url = "https://www.amazon.co.jp/gp/bestsellers/books/"
    # Alternative listing (home & kitchen); uncomment to use it instead.
    #url = "https://www.amazon.co.jp/gp/movers-and-shakers/kitchen/ref=zg_bsms_kitchen_home_all?pf_rd_p=3d910440-0455-48cb-a0f9-58ba9fadf260&pf_rd_s=center-1&pf_rd_t=2301&pf_rd_i=home&pf_rd_m=AN1VRQENFRJN5&pf_rd_r=0VA7VC8KD6BD1YMXWFR8&pf_rd_r=0VA7VC8KD6BD1YMXWFR8&pf_rd_p=3d910440-0455-48cb-a0f9-58ba9fadf260"
    main(url)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
1. urlのところに適当なリンクを載せる | |
2. 実行すると、 | |
No. 1 blt graph. vol.22(2017 AUG―写真集クオリティーのグラビア&インタビュー新型マガ 透きとおる、summer memory 桜井日奈子 (TOKYO NEWS MOOK 634号) 4863366663 | |
No. 2 ネコノヒー 1 4047348872 | |
No. 3 おもしろい!進化のふしぎ ざんねんないきもの事典 4471103644 | |
No. 4 オーバーロード12 聖王国の聖騎士 4047348457 | |
No. 5 女40歳から体が若くなる食べ方 (単行本) 4837926843 | |
てな具合にランキング順にタイトルとASIN(=アマゾンの商品管理IDらしい)が出力される。ret.csvにも出力される。
必要に応じて改変したらいい。 | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment