rakuten.py
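A small Rakuten Product Search scraper: it walks the index pages of category 200162 (books, magazines & comics) on product.rakuten.co.jp, collects each product's id, name, URL, and thumbnail image, downloads the thumbnails, and writes the collected records to a JSON file and a CSV file.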
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import json
import os
import re
import subprocess
import sys
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

# BeautifulSoup can recurse deeply on large pages; raise the limit up front
sys.setrecursionlimit(10000)
# Category 200162 = 本・雑誌・コミック (books, magazines & comics)
mainurl = "http://product.rakuten.co.jp"
downloadDirectory = "./downloads"
products = {}  # product_id -> {"id", "img_url", "product_name", "url"}
filename = input('choose filename--> ')
pages = int(input('how many pages?--> '))
index_url = mainurl + "/200162/?rdate=0&rev=0&st=1&s=1&p="
def main():
    num = 0
    for i in range(pages):
        index_page = pageget(i)
        print('\033[35m', ">>")
        print('\033[35m', index_page)
        print('\033[35m', "<<")
        try:
            # fetch and parse the index page once, then reuse the soup
            soup = data(index_page)
        except Exception as e:
            print(e)
            continue
        # get title & url, carrying the product counter across pages
        num = titleurl_get(soup, num)
    # json to csv
    json2csv()
    print('\033[31m', '\033[42m', '\033[1m', "Create csv [SUCCESS]!!", '\033[0m')
def pageget(i):
    # build the URL of result page i+1 (Rakuten result pages are 1-indexed)
    index_page = index_url + str(i + 1)
    time.sleep(0.5)  # small delay between page fetches
    return index_page
def data(index_page):
    # fetch the URL and parse the HTML into a soup
    result = requests.get(index_page)
    result.raise_for_status()
    soup = BeautifulSoup(result.content, "html.parser")
    return soup
def titleurl_get(soup, num):
    # each product sits in a <div class="proListItemName"> whose first <a> holds name and URL
    items = soup.find_all('div', {'class': 'proListItemName'})
    for title in items:
        num += 1
        url = title.a.get("href")
        product_id = url[-32:-1]  # product id embedded at the end of the URL
        # strip whitespace and the trailing "比較" ("compare") link text
        product_name = title.text.strip().rstrip('\n 比較')
        print('\033[94m', "product_id: ", '\033[33m', product_id)
        print('\033[94m', "product_url: ", '\033[33m', url)
        print('\033[94m', "No.", num, "product_name: ", '\033[33m', product_name, '\033[0m')
        img_url = imgurl_get(soup, product_id)
        print('\033[94m', "img_url: ", '\033[33m', img_url)
        time.sleep(1)
        print('\033[36m', ">-------------------------------<")
        # download the thumbnail image
        try:
            get_img(product_id, img_url, downloadDirectory)
            print('\033[31m', '\033[42m', '\033[1m', "get_img success!!", '\033[0m')
        except Exception as e:
            print(e)
        # record the product and rewrite the JSON file on every iteration
        products[product_id] = {
            "id": product_id, "img_url": img_url,
            "product_name": product_name, "url": url,
        }
        with open('%s.json' % filename, 'w') as f:
            json.dump(products, f, indent=4, ensure_ascii=False)
        print('\033[31m', '\033[42m', '\033[1m', "create json success!!", '\033[0m')
    return num
def imgurl_get(soup, product_id):
    # the thumbnail is an <img alt=""> inside an <a> whose href contains the product id
    imgs = soup.find_all('a', href=re.compile(product_id))
    for img in imgs:
        img_u = img.find("img", alt="")
        if img_u is not None:
            return img_u['src']
    return None  # no thumbnail found for this product
def get_img(product_id, img_url, download_directory):
    # save the thumbnail as <download_directory>/<product_id>.jpg via wget
    if img_url is None:
        raise ValueError("no image URL for product %s" % product_id)
    os.makedirs(download_directory, exist_ok=True)
    out_path = os.path.join(download_directory, "%s.jpg" % product_id)
    subprocess.call(["wget", "-O", out_path, img_url])
def json2csv():
    # load the JSON written above and save it as CSV, one row per product
    with open('%s.json' % filename, 'r') as f:
        data_json = json.load(f)
    json_pd = pd.DataFrame(data_json)
    json_pd_t = json_pd.T  # transpose so each product id becomes a row
    json_pd_t.to_csv("%s.csv" % filename)
if __name__ == '__main__':
    main()
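A typical session (a sketch; the two prompts and the output names come from the script above):

    $ python rakuten.py
    choose filename--> books
    how many pages?--> 2

This leaves books.json and books.csv in the working directory, plus one ./downloads/<product_id>.jpg per product (wget must be on the PATH).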