@aydinemre
Last active December 10, 2019 19:49
# -*- coding: utf-8 -*-
"""
pip install scrapy
pip install pandas
scrapy runspider scraper/scraper.py -s LOG_ENABLED=False
"""
import re

import pandas as pd
import scrapy

SUB_PAGE = "erkek-ayakkabi-bot-10097"
DOMAIN = "www.beymen.com"
URL = 'https://%s/%s/' % (DOMAIN, SUB_PAGE)

# Relative hrefs of product pages that have already been scheduled.
product_list = set()


class BeymenSpider(scrapy.Spider):
    name = DOMAIN
    allowed_domains = [DOMAIN]
    start_urls = [URL]

    # Accumulates one row per product: (product_id, category_id).
    df = pd.DataFrame(columns=["product_id", "category_id"])

    def add_to_df(self, mapping):
        # DataFrame.append was removed in pandas 2.0; concatenate a
        # one-row frame instead.
        self.df = pd.concat([self.df, pd.DataFrame([mapping])],
                            ignore_index=True)

    def parse_item(self, response):
        # Product URLs end with "..._<product_id>_<category_id>".
        splitted_url = response.url.split("_")
        category_id = splitted_url[-1]
        product_id = splitted_url[-2]
        self.add_to_df({"product_id": product_id,
                        "category_id": category_id})

    # Runs first: Scrapy calls parse() for every URL in start_urls.
    def parse(self, response):
        # Collect product links on the page, skipping ones already scheduled.
        for href in response.xpath("//div[@class='item']/a/@href").extract():
            if href.startswith("/p_") and href not in product_list:
                product_list.add(href)
                yield scrapy.Request(
                    response.urljoin(href),
                    callback=self.parse_item
                )
        # Traverse to the next page.
        PAGE_SELECTOR = ".page-link"
        for page in response.css(PAGE_SELECTOR).extract():
            if "next" in page:
                regex = re.search(r' +data-page="(.*?)"', page)
                if regex:
                    next_page = "?&page=" + regex.group(1)
                    yield scrapy.Request(
                        response.urljoin(next_page),
                        callback=self.parse
                    )

    def closed(self, reason):
        # Called once the crawl finishes; persist the collected mapping.
        self.df.to_csv('product_category_mapping.csv', index=False)
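
For reference, a minimal standalone sketch of the two extraction steps the spider relies on, using a hypothetical product URL and pager markup of the shape the code assumes (real Beymen pages may differ):

import re

# Hypothetical product URL; the spider assumes the last two "_"-separated
# segments are the product id and the category id.
url = "https://www.beymen.com/p_deri-bot_605412_10097"
parts = url.split("_")
print(parts[-2], parts[-1])  # 605412 10097 -> (product_id, category_id)

# Hypothetical pager element; the spider scans rendered .page-link elements
# for "next" and pulls out the data-page attribute with the same regex.
pager = '<a class="page-link" data-page="2">next</a>'
match = re.search(r' +data-page="(.*?)"', pager)
print("?&page=" + match.group(1))  # ?&page=2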
@aydinemre commented Feb 12, 2019
Usage

Install dependencies:

$ pip install scrapy
$ pip install pandas

Run the spider (-s LOG_ENABLED=False silences Scrapy's log output):

$ scrapy runspider scraper/scraper.py -s LOG_ENABLED=False
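
When the crawl finishes, closed() writes the collected mapping to product_category_mapping.csv. A sketch of the expected shape (the rows below are hypothetical):

product_id,category_id
605412,10097
610388,10097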