rakuten.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Scrape product names, URLs, and images from Rakuten product-search
index pages (category 200162: books, magazines & comics) and save the
results as JSON and CSV."""
from bs4 import BeautifulSoup
import requests
import re
import os
import json
import time
import sys
import subprocess
import pandas as pd

sys.setrecursionlimit(10000)

mainurl = "http://product.rakuten.co.jp"
downloadDirectory = "./downloads"
products = {}  # accumulated product records, keyed by product id
filename = input('choose filename--> ')
pages = int(input('how many pages?--> '))
# Category 200162 is books, magazines & comics
index_url = mainurl + "/200162/?rdate=0&rev=0&st=1&s=1&p="
def main():
    num = 0
    for i in range(pages):
        index_page = pageget(i)
        print('\033[35m', ">>")
        print('\033[35m', index_page)
        print('\033[35m', "<<")
        try:
            # fetch the index page and parse it into a soup
            soup = data(index_page)
        except Exception as e:
            print(e)
            continue
        # scrape title, URL, and image for every product on the page
        num = titleurl_get(soup, num)
    # convert the accumulated JSON file to CSV
    json2csv()
    print('\033[31m', '\033[42m', '\033[1m', "Create csv [SUCCESS]!!", '\033[0m')
def pageget(i):
    # build the URL of the (i+1)-th index page, pausing briefly between requests
    time.sleep(0.5)
    return index_url + str(i + 1)
def data(index_page):
    # fetch the page and parse the HTML into a soup
    result = requests.get(index_page)
    return BeautifulSoup(result.content, "html.parser")
def titleurl_get(soup, num):
    # extract the title and URL of every product on the page
    items = soup.find_all('div', {'class': 'proListItemName'})
    for title in items:
        num += 1
        url = title.a.get("href")
        product_id = url[-32:-1]  # 32-character id before the trailing slash
        # drop the trailing "比較" ("compare") link text from the title
        product_name = title.text.strip().rstrip('\n 比較')
        print('\033[94m', "product_id: ", '\033[33m', product_id)
        print('\033[94m', "product_url: ", '\033[33m', url)
        print('\033[94m', "No.", num, "product_name: ", '\033[33m', product_name, '\033[0m')
        img_url = imgurl_get(soup, product_id)
        print('\033[94m', "img_url: ", '\033[33m', img_url)
        time.sleep(1)
        print('\033[36m', ">-------------------------------<")
        # download the image file
        try:
            get_img(product_id, img_url, downloadDirectory)
            print('\033[31m', '\033[42m', '\033[1m', "get_img success!!", '\033[0m')
        except Exception as e:
            print(e)
        # record the product and rewrite the JSON file
        products[product_id] = {
            "id": product_id, "img_url": img_url, "product_name": product_name, "url": url
        }
        with open('%s.json' % filename, 'w') as f:
            json.dump(products, f, indent=4, ensure_ascii=False)
        print('\033[31m', '\033[42m', '\033[1m', "create json success!!", '\033[0m')
    return num
def imgurl_get(soup, product_id):
    # return the first image URL linked from an anchor whose href contains the product id
    anchors = soup.find_all('a', href=re.compile(product_id))
    for a in anchors:
        img_tag = a.find("img", alt="")
        if img_tag is not None:
            return img_tag['src']
    return None
def get_img(product_id, img_url, downDirectory):
    # shell out to wget to download the image into the downloads directory
    os.makedirs(downDirectory, exist_ok=True)
    cmd = "wget -O %s/%s.jpg %s" % (downDirectory, product_id, img_url)
    subprocess.call(cmd.split(" "))
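
# A pure-Python alternative to shelling out to wget, using
# urllib.request.urlretrieve (which the original gist imports but never uses).
# A minimal sketch under that assumption; get_img_py is a hypothetical name,
# not part of the original script.
def get_img_py(product_id, img_url, downDirectory):
    from urllib.request import urlretrieve
    os.makedirs(downDirectory, exist_ok=True)
    urlretrieve(img_url, "%s/%s.jpg" % (downDirectory, product_id))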
def json2csv():
    # load the accumulated JSON and write it out as CSV, one row per product
    with open('%s.json' % filename, 'r') as f:
        data_json = json.load(f)
    pd.DataFrame(data_json).T.to_csv("%s.csv" % filename)
if __name__ == '__main__':
    main()
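
Running the script prompts for an output filename (without extension) and a page count, then writes <filename>.json, <filename>.csv, and the product images under ./downloads/. To sanity-check the output, the CSV can be read back with pandas; a minimal sketch, assuming the script was run with the hypothetical filename "books":

check_output.py
import pandas as pd

# The CSV's first column is the product id (the DataFrame index written by
# json2csv), so read it back as the index. "books" is a hypothetical filename.
df = pd.read_csv("books.csv", index_col=0)
print(df[["product_name", "url"]].head())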