Last active
September 14, 2022 10:54
-
-
Save y-ookuma/47d1264be232043a8e17b59fd011ae56 to your computer and use it in GitHub Desktop.
JANコードを使ってYahoo!ショッピングAPIから商品名・荷姿・価格を取得する。入力ファイルは jan.txt、出力ファイルは output.csv。
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests,csv,json,time,os,collections | |
from bs4 import BeautifulSoup | |
import mojimoji as mo | |
import pandas as pd | |
# Yahoo! application ID sent as the `appid` query parameter of every API
# request; replace the placeholder with a real ID before running.
client_id = "Your Yahoo client id"
# Text file that is read line by line; each non-empty line is treated as one JAN code.
inputfile="jan.txt"
# CSV file the results are written to; an existing file at this path is deleted first.
outputfile="output.csv"
def code_to_product_info(code):
    """Look up one JAN code with the Yahoo! Shopping item-search API.

    The first hit's name is normalised to half-width characters, known
    marketing/category words are removed, and the remaining text is split on
    whitespace: the first token becomes the product name and the second token
    the package size.

    Args:
        code: JAN code (string) used as the search query.

    Returns:
        ``[code, product_name, product_volume, product_price]``. The three
        product fields are empty strings when the search returned no hit or
        the corresponding value is absent from the response.

    Raises:
        requests.RequestException: on network failure, timeout, or an HTTP
            error status returned by the API.
    """
    # Local import so this function does not rely on a file-level import.
    from urllib.parse import urlencode

    # URL-encode the parameters instead of pasting them into the URL, so
    # unexpected characters in the code or app ID cannot corrupt the query.
    query_string = urlencode({"hits": 2, "appid": client_id, "query": code})
    TARGET_URL = (
        "https://shopping.yahooapis.jp/ShoppingWebService/V3/itemSearch?"
        + query_string
    )
    print(TARGET_URL)

    # A timeout keeps one stalled connection from hanging the whole batch;
    # raise_for_status surfaces API errors instead of a confusing KeyError.
    response = requests.get(TARGET_URL, timeout=30)
    response.raise_for_status()
    payload = json.loads(response.text)

    product_name = ""    # product name
    product_volume = ""  # package size / packaging form
    product_price = ""   # price

    hits = payload.get("hits") or []
    if payload.get("totalResultsAvailable", 0) > 0 and hits:
        item = hits[0]

        price = (item.get("priceLabel") or {}).get("defaultPrice")
        if price is not None:
            product_price = price

        # Full-width -> half-width. mojimoji also narrows katakana by default.
        name = mo.zen_to_han(item.get("name") or "")

        # Words to strip from the title. Longer phrases come before the
        # shorter phrases they contain, otherwise the longer one never matches.
        noise_words = (
            '水稲用除草剤',
            '殺菌剤',
            '農薬',
            '除草剤',
            '(送料無料)',
            'アグロカネショウ',
            '【メール便可】',
            '(ウララドライフロアブル)',
            'バイエル',
        )
        for word in noise_words:
            # The title has already been narrowed, so the literal (full-width
            # katakana) form would never match; remove the narrowed form too.
            name = name.replace(mo.zen_to_han(word), '').replace(word, '')

        name = name.replace('/', ' ')
        # Collapse every run of whitespace to a single space and trim both
        # ends, so the second token is never an empty string.
        name = " ".join(name.split())

        tokens = name.split(" ")
        product_name = tokens[0]
        product_volume = tokens[1] if len(tokens) > 1 else ""

    return [code, product_name, product_volume, product_price]
# Delete a previous output file so a failed run cannot be mistaken for fresh results.
if os.path.isfile(outputfile):
    os.remove(outputfile)

cols = ["No", "JANコード", "商品名", "荷姿", "価格", "在庫数"]

# Read the JAN codes. `with` guarantees the file is closed, and "utf-8-sig"
# reads plain UTF-8 as well as files that start with a byte-order mark.
with open(inputfile, 'r', encoding='utf-8-sig') as f:
    janlist = [line.strip() for line in f]

# Count duplicates AFTER stripping, so "123\n" and a final "123" without a
# newline are the same key, and blank lines are ignored entirely.
# key: JAN code, value: number of occurrences (written to the stock column).
jandict = collections.Counter(code for code in janlist if code)
jancnt = len(jandict)

# Collect plain rows and build the DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and was quadratic anyway.
rows = []
for i, (jancode, count) in enumerate(jandict.items(), start=1):
    print('*********** start product_info ************')
    print(" 処理件数: ", i, "/", jancnt, " JANコード:", jancode)
    output1 = code_to_product_info(jancode)
    output = [i] + output1 + [count]
    print(output)
    rows.append(output)
    print('*********** end product_info ************')
    print("")
    # Pause between API calls to respect the rate limit; nothing follows the
    # last request, so no wait is needed there.
    if i < jancnt:
        time.sleep(5)

# Write the CSV file.
df = pd.DataFrame(rows, columns=cols)
df.to_csv(outputfile, index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment