rakuten.py
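A small Rakuten Product Search scraper: it walks the index pages of category 200162 (books, magazines & comics) on product.rakuten.co.jp, collects each product's id, name, URL, and thumbnail image, downloads the thumbnails, and writes the collected records to a JSON file and a CSV file.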
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import json
import os
import re
import subprocess
import sys
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

# BeautifulSoup can recurse deeply on large pages; raise the limit up front
sys.setrecursionlimit(10000)
# Category 200162 = 本・雑誌・コミック (books, magazines & comics)
mainurl = "http://product.rakuten.co.jp"
downloadDirectory = "./downloads"
products = {}  # product_id -> {"id", "img_url", "product_name", "url"}
filename = input('choose filename--> ')
pages = int(input('how many pages?--> '))
index_url = mainurl + "/200162/?rdate=0&rev=0&st=1&s=1&p="
def main():
    num = 0
    for i in range(pages):
        index_page = pageget(i)
        print('\033[35m', ">>")
        print('\033[35m', index_page)
        print('\033[35m', "<<")
        try:
            # fetch and parse the index page once, then reuse the soup
            soup = data(index_page)
        except Exception as e:
            print(e)
            continue
        # get title & url, carrying the product counter across pages
        num = titleurl_get(soup, num)
    # json to csv
    json2csv()
    print('\033[31m', '\033[42m', '\033[1m', "Create csv [SUCCESS]!!", '\033[0m')
def pageget(i):
    # build the URL of result page i+1 (Rakuten result pages are 1-indexed)
    index_page = index_url + str(i + 1)
    time.sleep(0.5)  # small delay between page fetches
    return index_page
def data(index_page):
    # fetch the URL and parse the HTML into a soup
    result = requests.get(index_page)
    result.raise_for_status()
    soup = BeautifulSoup(result.content, "html.parser")
    return soup
def titleurl_get(soup, num):
    # each product sits in a <div class="proListItemName"> whose first <a> holds name and URL
    items = soup.find_all('div', {'class': 'proListItemName'})
    for title in items:
        num += 1
        url = title.a.get("href")
        product_id = url[-32:-1]  # product id embedded at the end of the URL
        # strip whitespace and the trailing "比較" ("compare") link text
        product_name = title.text.strip().rstrip('\n 比較')
        print('\033[94m', "product_id: ", '\033[33m', product_id)
        print('\033[94m', "product_url: ", '\033[33m', url)
        print('\033[94m', "No.", num, "product_name: ", '\033[33m', product_name, '\033[0m')
        img_url = imgurl_get(soup, product_id)
        print('\033[94m', "img_url: ", '\033[33m', img_url)
        time.sleep(1)
        print('\033[36m', ">-------------------------------<")
        # download the thumbnail image
        try:
            get_img(product_id, img_url, downloadDirectory)
            print('\033[31m', '\033[42m', '\033[1m', "get_img success!!", '\033[0m')
        except Exception as e:
            print(e)
        # record the product and rewrite the JSON file on every iteration
        products[product_id] = {
            "id": product_id, "img_url": img_url,
            "product_name": product_name, "url": url,
        }
        with open('%s.json' % filename, 'w') as f:
            json.dump(products, f, indent=4, ensure_ascii=False)
        print('\033[31m', '\033[42m', '\033[1m', "create json success!!", '\033[0m')
    return num
def imgurl_get(soup, product_id):
    # the thumbnail is an <img alt=""> inside an <a> whose href contains the product id
    imgs = soup.find_all('a', href=re.compile(product_id))
    for img in imgs:
        img_u = img.find("img", alt="")
        if img_u is not None:
            return img_u['src']
    return None  # no thumbnail found for this product
def get_img(product_id, img_url, download_directory):
    # save the thumbnail as <download_directory>/<product_id>.jpg via wget
    if img_url is None:
        raise ValueError("no image URL for product %s" % product_id)
    os.makedirs(download_directory, exist_ok=True)
    out_path = os.path.join(download_directory, "%s.jpg" % product_id)
    subprocess.call(["wget", "-O", out_path, img_url])
def json2csv():
    # load the JSON written above and save it as CSV, one row per product
    with open('%s.json' % filename, 'r') as f:
        data_json = json.load(f)
    json_pd = pd.DataFrame(data_json)
    json_pd_t = json_pd.T  # transpose so each product id becomes a row
    json_pd_t.to_csv("%s.csv" % filename)
if __name__ == '__main__':
    main()
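A typical session (a sketch; the two prompts and the output names come from the script above):

    $ python rakuten.py
    choose filename--> books
    how many pages?--> 2

This leaves books.json and books.csv in the working directory, plus one ./downloads/<product_id>.jpg per product (wget must be on the PATH).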