Last active
June 19, 2020 04:36
-
-
Save miyachin/8593130e1e383685e49e480c3acdae7b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import urllib | |
import time | |
from selenium import webdriver | |
from selenium.common.exceptions import NoSuchElementException | |
from selenium.common.exceptions import TimeoutException | |
# Mercari search-results URL (category 8/1164, price 10,000-50,000 JPY,
# ascending price, on-sale and sold-out listings included); the 1-based
# page number is appended to the trailing "page=" parameter.
search_result_url = "https://www.mercari.com/jp/search/?sort_order=price_asc&category_root=8&category_child=1164&brand_name=&brand_id=&size_group=&price_min=10000&price_max=50000&status_all=1&status_on_sale=1&status_trading_sold_out=1&page="
page_index = 1  # current results page, appended to search_result_url
item_list = []  # accumulator for scraped items whose comments mention a discount
# Prepare the browser: Chrome driven via a chromedriver binary expected
# in the current working directory.
browser = webdriver.Chrome("./chromedriver")
browser.set_page_load_timeout(10)  # abort page loads that take longer than 10 seconds
try:
    while True:
        print("scraping page " + str(page_index) + " ...")
        browser.get(search_result_url + str(page_index))
        time.sleep(2)  # give the JS-rendered listing time to settle

        # Collect the item-detail URLs linked from this results page.
        urls = []
        a_tags = browser.find_elements_by_xpath("//a")
        for a_tag in a_tags:
            url = a_tag.get_attribute("href")
            # get_attribute() returns None for anchors without an href;
            # guard before the substring test to avoid a TypeError.
            if url and "/items/" in url:
                urls.append(url)

        # Visit each item page in turn and scrape its details.
        for url in urls:
            browser.get(url)
            is_discounted_item = False
            item_name = browser.find_element_by_class_name("item-name").get_attribute("innerText")
            item_price = browser.find_element_by_class_name("item-price").get_attribute("innerText")
            item_price = int(item_price.replace("¥", "").replace(",", ""))  # "¥12,345" -> 12345
            item_by_btn = browser.find_element_by_class_name("item-buy-btn").get_attribute("innerText")
            # The buy button reads "売り切れました" ("sold out") once the item is gone.
            sell_status = "SOLD" if item_by_btn == "売り切れました" else "SELL"
            td_tags = browser.find_elements_by_xpath("//td")
            item_category = td_tags[1].get_attribute("innerText")
            item_condition = td_tags[3].get_attribute("innerText")

            # Gather every comment; flag the item when any comment mentions
            # a price cut ("値下") or discount ("値引").
            comment_list = []
            comments = browser.find_elements_by_xpath("//div[@class='message-body']")
            for comment in comments:
                comment_text = comment.get_attribute("innerText").replace("\n", "")
                comment_list.append(comment_text)
                if "値下" in comment_text or "値引" in comment_text:
                    is_discounted_item = True

            # Only discounted items are kept for the CSV export.
            if is_discounted_item:
                item = {
                    "name": item_name,
                    "price": item_price,
                    "comment_list": "\n".join(comment_list),
                    "condition": item_condition,
                    "category": item_category,
                    "status": sell_status,
                    "url": url,
                }
                item_list.append(item)
            print(len(item_list))
            if len(item_list) >= 1000 or page_index >= 100:
                break
        # Stop once enough items were collected or 100 pages were scanned.
        if len(item_list) >= 1000 or page_index >= 100:
            break
        page_index += 1
except NoSuchElementException:
    print("指定した要素が見つかりませんでした")
except TimeoutException:
    print("読み込みがタイムアウトしました")
finally:
    # Always shut the browser down — even on exceptions other than the two
    # handled above — so the chromedriver process does not leak.
    browser.quit()
# Export the collected items to data.csv.
# newline="" is required by the csv module (it writes its own "\r\n"
# terminators; without it Windows doubles every row separator), and an
# explicit utf-8 encoding keeps the Japanese names/comments intact.
with open("data.csv", "w", newline="", encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(("name", "price", "comments", "condition", "category", "status", "url"))
    for item in item_list:
        writer.writerow([item["name"], item["price"], item["comment_list"],
                         item["condition"], item["category"], item["status"], item["url"]])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment