Last active
June 19, 2020 04:36
-
-
Save miyachin/8593130e1e383685e49e480c3acdae7b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import urllib | |
import time | |
from selenium import webdriver | |
from selenium.common.exceptions import NoSuchElementException | |
from selenium.common.exceptions import TimeoutException | |
# Mercari search-results URL (category 8/1164, price 10,000-50,000 JPY,
# ascending price, on-sale and sold-out listings included); the 1-based
# page number is appended to the trailing "page=" parameter.
search_result_url = "https://www.mercari.com/jp/search/?sort_order=price_asc&category_root=8&category_child=1164&brand_name=&brand_id=&size_group=&price_min=10000&price_max=50000&status_all=1&status_on_sale=1&status_trading_sold_out=1&page="
page_index = 1  # current results page, appended to search_result_url
item_list = []  # accumulator for scraped items whose comments mention a discount
# Prepare the browser: Chrome driven via a chromedriver binary expected
# in the current working directory.
browser = webdriver.Chrome("./chromedriver")
browser.set_page_load_timeout(10)  # abort page loads that take longer than 10 seconds
try:
    while True:
        print("scraping page " + str(page_index) + " ...")
        browser.get(search_result_url + str(page_index))
        time.sleep(2)  # give the JS-rendered listing time to settle

        # Collect the item-detail URLs linked from this results page.
        urls = []
        a_tags = browser.find_elements_by_xpath("//a")
        for a_tag in a_tags:
            url = a_tag.get_attribute("href")
            # get_attribute() returns None for anchors without an href;
            # guard before the substring test to avoid a TypeError.
            if url and "/items/" in url:
                urls.append(url)

        # Visit each item page in turn and scrape its details.
        for url in urls:
            browser.get(url)
            is_discounted_item = False
            item_name = browser.find_element_by_class_name("item-name").get_attribute("innerText")
            item_price = browser.find_element_by_class_name("item-price").get_attribute("innerText")
            item_price = int(item_price.replace("¥", "").replace(",", ""))  # "¥12,345" -> 12345
            item_by_btn = browser.find_element_by_class_name("item-buy-btn").get_attribute("innerText")
            # The buy button reads "売り切れました" ("sold out") once the item is gone.
            sell_status = "SOLD" if item_by_btn == "売り切れました" else "SELL"
            td_tags = browser.find_elements_by_xpath("//td")
            item_category = td_tags[1].get_attribute("innerText")
            item_condition = td_tags[3].get_attribute("innerText")

            # Gather every comment; flag the item when any comment mentions
            # a price cut ("値下") or discount ("値引").
            comment_list = []
            comments = browser.find_elements_by_xpath("//div[@class='message-body']")
            for comment in comments:
                comment_text = comment.get_attribute("innerText").replace("\n", "")
                comment_list.append(comment_text)
                if "値下" in comment_text or "値引" in comment_text:
                    is_discounted_item = True

            # Only discounted items are kept for the CSV export.
            if is_discounted_item:
                item = {
                    "name": item_name,
                    "price": item_price,
                    "comment_list": "\n".join(comment_list),
                    "condition": item_condition,
                    "category": item_category,
                    "status": sell_status,
                    "url": url,
                }
                item_list.append(item)
            print(len(item_list))
            if len(item_list) >= 1000 or page_index >= 100:
                break
        # Stop once enough items were collected or 100 pages were scanned.
        if len(item_list) >= 1000 or page_index >= 100:
            break
        page_index += 1
except NoSuchElementException:
    print("指定した要素が見つかりませんでした")
except TimeoutException:
    print("読み込みがタイムアウトしました")
finally:
    # Always shut the browser down — even on exceptions other than the two
    # handled above — so the chromedriver process does not leak.
    browser.quit()
# Export the collected items to data.csv.
# newline="" is required by the csv module (it writes its own "\r\n"
# terminators; without it Windows doubles every row separator), and an
# explicit utf-8 encoding keeps the Japanese names/comments intact.
with open("data.csv", "w", newline="", encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(("name", "price", "comments", "condition", "category", "status", "url"))
    for item in item_list:
        writer.writerow([item["name"], item["price"], item["comment_list"],
                         item["condition"], item["category"], item["status"], item["url"]])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment