# y-ookuma/JanCode_to_product_info.py

## JanCode_to_product_info.py
import requests,csv,json,time,os,collections
from bs4 import BeautifulSoup
import mojimoji as mo
import pandas as pd

# Yahoo! application ID; sent as the `appid` query parameter of every
# itemSearch request. Replace the placeholder before running.
client_id = "Your Yahoo client id"
# Input text file, read line by line; each non-blank line is used as a JAN code.
inputfile="jan.txt"
# Output CSV path; an existing file is deleted at startup and rewritten at the end.
outputfile="output.csv"

# Noise words stripped from item names before splitting into name / volume.
# '水稲用除草剤' is listed before '除草剤': removing the shorter word first
# would leave '水稲用' behind and the longer phrase could never match.
_NAME_NOISE_WORDS = (
    '水稲用除草剤',
    '殺菌剤',
    '農薬',
    '除草剤',
    '(送料無料)',
    'ｱｸﾞﾛｶﾈｼｮｳ',
    '【ﾒｰﾙ便可】',
    '(ｳﾗﾗﾄﾞﾗｲﾌﾛｱﾌﾞﾙ)',
    'ﾊﾞｲｴﾙ',
)


def _split_name_and_volume(name):
    """Strip noise words from an item name and split it into (name, volume)."""
    for word in _NAME_NOISE_WORDS:
        name = name.replace(word, '')
    # '/' acts as a separator. Empty tokens are dropped so that runs of
    # spaces left behind by the removed words cannot produce an empty volume.
    tokens = [t for t in name.replace('/', ' ').strip().split(' ') if t]
    product_name = tokens[0] if tokens else ""
    product_volume = tokens[1] if len(tokens) > 1 else ""
    return product_name, product_volume


def code_to_product_info(code):
    """Look up a JAN code with the Yahoo! Shopping V3 itemSearch API.

    Only the first hit of the response is used. Its name is converted to
    half-width characters, cleaned of known noise words, and split on
    spaces: the first token becomes the product name and the second the
    volume / packaging.

    Args:
        code: JAN code (or any search string) sent as the ``query`` parameter.

    Returns:
        ``[code, product_name, product_volume, product_price]``. The last
        three entries are empty strings when the search returns no hits;
        ``product_price`` is also ``""`` when the hit carries no default
        price.

    Raises:
        requests.exceptions.RequestException: on connection failure,
            timeout, or a non-2xx HTTP status.
        KeyError: if the JSON response lacks the expected keys.
    """
    # Function-scope import so this block does not depend on a file-level import.
    from urllib.parse import urlencode

    # urlencode escapes the query value, so codes containing spaces or
    # reserved characters cannot corrupt the URL.
    query = urlencode({"hits": 2, "appid": client_id, "query": code})
    TARGET_URL = "https://shopping.yahooapis.jp/ShoppingWebService/V3/itemSearch?" + query
    print(TARGET_URL)

    # Without a timeout a stalled connection would block the whole batch.
    response = requests.get(TARGET_URL, timeout=30)
    # Fail loudly on HTTP errors instead of parsing an error body.
    response.raise_for_status()
    htmljson = json.loads(response.text)

    product_name = ""    # product name
    product_volume = ""  # volume / packaging
    product_price = ""   # price

    hits = htmljson.get("hits") or []
    if htmljson["totalResultsAvailable"] > 0 and hits:
        item = hits[0]
        price = item["priceLabel"]["defaultPrice"]
        if price is not None:
            product_price = price
        # Full-width -> half-width first, so the noise-word list (written in
        # half-width kana / ASCII punctuation) matches.
        name = mo.zen_to_han(item["name"])
        product_name, product_volume = _split_name_and_volume(name)

    return [code, product_name, product_volume, product_price]

# Remove a stale output file up front, so a failed run does not leave old
# results behind.
if os.path.isfile(outputfile):
    os.remove(outputfile)

cols =["No","JANコード","商品名","荷姿","価格","在庫数"]

# Read the JAN codes. Lines are stripped BEFORE counting so that the same
# code with different surrounding whitespace (e.g. a final line without a
# trailing newline) is counted as one code, and blank lines are ignored
# instead of consuming a row number and a sleep.
with open(inputfile, 'r', encoding='UTF-8') as f:
    janlist = [line.strip() for line in f]
jandict = collections.Counter(code for code in janlist if code)  # key: JAN code, value: occurrences
jancnt = len(jandict)

# Rows are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on each call.
rows = []
for i, (jancode, count) in enumerate(jandict.items(), start=1):
    print('*********** start product_info ************')
    print(" 処理件数：  ",i,"/",jancnt, "  JANコード:",jancode)
    output1 = code_to_product_info(jancode)
    output = [i] + output1 + [count]
    print(output)
    rows.append(output)

    print('*********** end product_info ************')
    print("")
    # Pause between API calls (rate-limit handling); not needed after the last one.
    if i < jancnt:
        time.sleep(5)

# Write the CSV file.
df = pd.DataFrame(rows, columns=cols)
df.to_csv(outputfile, index = False)
	import requests,csv,json,time,os,collections
	from bs4 import BeautifulSoup
	import mojimoji as mo
	import pandas as pd

	# Yahoo! application ID; sent as the `appid` query parameter of every
	# itemSearch request. Replace the placeholder before running.
	client_id = "Your Yahoo client id"
	# Input text file, read line by line; each non-blank line is used as a JAN code.
	inputfile="jan.txt"
	# Output CSV path; an existing file is deleted at startup and rewritten at the end.
	outputfile="output.csv"

	def code_to_product_info(code):
	    """Look up a JAN code with the Yahoo! Shopping V3 itemSearch API.

	    Only the first hit of the response is used. Its name is converted to
	    half-width characters, cleaned of known noise words, and split on
	    spaces: the first token becomes the product name and the second the
	    volume / packaging.

	    Args:
	        code: JAN code (or any search string) sent as the ``query`` parameter.

	    Returns:
	        ``[code, product_name, product_volume, product_price]``. The last
	        three entries are empty strings when the search returns no hits;
	        ``product_price`` is also ``""`` when the hit carries no default
	        price.

	    Raises:
	        requests.exceptions.RequestException: on connection failure,
	            timeout, or a non-2xx HTTP status.
	        KeyError: if the JSON response lacks the expected keys.
	    """
	    # Function-scope import so this block does not depend on a file-level import.
	    from urllib.parse import urlencode

	    # urlencode escapes the query value, so codes containing spaces or
	    # reserved characters cannot corrupt the URL.
	    query = urlencode({"hits": 2, "appid": client_id, "query": code})
	    TARGET_URL = "https://shopping.yahooapis.jp/ShoppingWebService/V3/itemSearch?" + query
	    print(TARGET_URL)

	    # Without a timeout a stalled connection would block the whole batch.
	    response = requests.get(TARGET_URL, timeout=30)
	    # Fail loudly on HTTP errors instead of parsing an error body.
	    response.raise_for_status()
	    htmljson = json.loads(response.text)

	    product_name = ""    # product name
	    product_volume = ""  # volume / packaging
	    product_price = ""   # price

	    hits = htmljson.get("hits") or []
	    if htmljson["totalResultsAvailable"] > 0 and hits:
	        item = hits[0]
	        price = item["priceLabel"]["defaultPrice"]
	        if price is not None:
	            product_price = price
	        # Full-width -> half-width first, so the noise words below (written
	        # in half-width kana / ASCII punctuation) match.
	        name = mo.zen_to_han(item["name"])
	        # '水稲用除草剤' must be removed before '除草剤'; otherwise the
	        # shorter word is stripped first and '水稲用' is left behind.
	        for word in ('水稲用除草剤', '殺菌剤', '農薬', '除草剤', '(送料無料)',
	                     'ｱｸﾞﾛｶﾈｼｮｳ', '【ﾒｰﾙ便可】', '(ｳﾗﾗﾄﾞﾗｲﾌﾛｱﾌﾞﾙ)', 'ﾊﾞｲｴﾙ'):
	            name = name.replace(word, '')
	        # '/' acts as a separator. Empty tokens are dropped so that runs of
	        # spaces left by the removed words cannot produce an empty volume.
	        tokens = [t for t in name.replace('/', ' ').strip().split(' ') if t]
	        if tokens:
	            product_name = tokens[0]
	        if len(tokens) > 1:
	            product_volume = tokens[1]

	    return [code, product_name, product_volume, product_price]

	# Remove a stale output file up front, so a failed run does not leave old
	# results behind.
	if os.path.isfile(outputfile):
	    os.remove(outputfile)

	cols =["No","JANコード","商品名","荷姿","価格","在庫数"]

	# Read the JAN codes. Lines are stripped BEFORE counting so that the same
	# code with different surrounding whitespace (e.g. a final line without a
	# trailing newline) is counted as one code, and blank lines are ignored
	# instead of consuming a row number and a sleep.
	with open(inputfile, 'r', encoding='UTF-8') as f:
	    janlist = [line.strip() for line in f]
	jandict = collections.Counter(code for code in janlist if code)  # key: JAN code, value: occurrences
	jancnt = len(jandict)

	# Rows are collected in a list and turned into a DataFrame once at the end:
	# DataFrame.append was removed in pandas 2.0 and copied the frame on each call.
	rows = []
	for i, (jancode, count) in enumerate(jandict.items(), start=1):
	    print('********* start product_info **********')
	    print(" 処理件数： ",i,"/",jancnt, " JANコード:",jancode)
	    output1 = code_to_product_info(jancode)
	    output = [i] + output1 + [count]
	    print(output)
	    rows.append(output)

	    print('********* end product_info **********')
	    print("")
	    # Pause between API calls (rate-limit handling); not needed after the last one.
	    if i < jancnt:
	        time.sleep(5)

	# Write the CSV file.
	df = pd.DataFrame(rows, columns=cols)
	df.to_csv(outputfile, index = False)