Skip to content

Instantly share code, notes, and snippets.

@miyachin
Last active June 19, 2020 04:36
Show Gist options
  • Save miyachin/8593130e1e383685e49e480c3acdae7b to your computer and use it in GitHub Desktop.
Save miyachin/8593130e1e383685e49e480c3acdae7b to your computer and use it in GitHub Desktop.
import csv
import urllib
import time
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
# Search-result URL; the page number is appended to the trailing "page=".
# The literal is assembled from adjacent string pieces, one query parameter
# per line, so individual filters are easy to spot and edit.
search_result_url = (
    "https://www.mercari.com/jp/search/"
    "?sort_order=price_asc"
    "&category_root=8"
    "&category_child=1164"
    "&brand_name="
    "&brand_id="
    "&size_group="
    "&price_min=10000"
    "&price_max=50000"
    "&status_all=1"
    "&status_on_sale=1"
    "&status_trading_sold_out=1"
    "&page="
)

# Crawl state: the result page to fetch next and the rows collected so far.
page_index = 1
item_list = []

# Start the browser using the chromedriver binary in the working directory,
# and cap every page load at 10 seconds.
browser = webdriver.Chrome("./chromedriver")
browser.set_page_load_timeout(10)
# Crawl limits: stop once this many matching items have been collected or
# once this many result pages have been processed.
MAX_ITEMS = 1000
MAX_PAGES = 100

# Walk the search-result pages one by one, open every item page linked from
# them, and keep the items whose comments mention a price reduction.
# A missing element or a page-load timeout ends the crawl early; whatever was
# collected up to that point stays in item_list.
try:
    while True:
        print("scraping page " + str(page_index) + " ...")
        browser.get(search_result_url + str(page_index))
        time.sleep(2)  # fixed pause after loading the result page

        # Build the list of item-page URLs found on this result page.
        # get_attribute("href") returns None for anchors without an href,
        # so check for that before the substring test to avoid a TypeError.
        urls = []
        for a_tag in browser.find_elements_by_xpath("//a"):
            url = a_tag.get_attribute("href")
            if url and "/items/" in url:
                urls.append(url)

        # Visit the collected URLs one at a time.
        for url in urls:
            browser.get(url)
            item_name = browser.find_element_by_class_name("item-name").get_attribute("innerText")
            price_text = browser.find_element_by_class_name("item-price").get_attribute("innerText")
            # Convert the price string (e.g. "¥12,000") to an int.
            item_price = int(price_text.replace("¥", "").replace(",", ""))
            buy_btn_text = browser.find_element_by_class_name("item-buy-btn").get_attribute("innerText")
            # The buy button reads "売り切れました" once the item is sold.
            sell_status = "SOLD" if buy_btn_text == "売り切れました" else "SELL"
            # Category and condition are read by position from the page's
            # <td> cells (indexes 1 and 3).
            td_tags = browser.find_elements_by_xpath("//td")
            item_category = td_tags[1].get_attribute("innerText")
            item_condition = td_tags[3].get_attribute("innerText")

            # Gather every comment, flattened to a single line each.
            comment_list = [
                comment.get_attribute("innerText").replace("\n", "")
                for comment in browser.find_elements_by_xpath("//div[@class='message-body']")
            ]
            # An item counts as "discounted" when any comment contains
            # "値下" or "値引" (price-reduction keywords).
            is_discounted_item = any(
                "値下" in text or "値引" in text for text in comment_list
            )

            # Only discounted items are queued for the CSV export.
            if is_discounted_item:
                item_list.append({
                    "name": item_name,
                    "price": item_price,
                    "comment_list": "\n".join(comment_list),
                    "condition": item_condition,
                    "category": item_category,
                    "status": sell_status,
                    "url": url,
                })
            print(len(item_list))

            # Stop mid-page only when the item quota is reached; the page
            # limit is checked after the page is finished so that the last
            # page is not cut short after its first item.
            if len(item_list) >= MAX_ITEMS:
                break

        if len(item_list) >= MAX_ITEMS or page_index >= MAX_PAGES:
            break
        page_index += 1
except NoSuchElementException:
    print("指定した要素が見つかりませんでした")
except TimeoutException:
    print("読み込みがタイムアウトしました")
finally:
    # Always close the browser, including when an unexpected exception
    # (e.g. IndexError, ValueError) propagates out of the crawl.
    browser.quit()
# Export the collected items to data.csv, one row per item.
# newline="" is required by the csv module so that the newlines embedded in
# the joined comment field are written verbatim inside a quoted cell instead
# of being translated by the platform. The encoding is set explicitly so the
# Japanese text does not depend on the locale's default codec.
with open("data.csv", "w", newline="", encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(("name", "price", "comments", "condition", "category", "status", "url"))
    writer.writerows(
        [
            item["name"],
            item["price"],
            item["comment_list"],
            item["condition"],
            item["category"],
            item["status"],
            item["url"],
        ]
        for item in item_list
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment