Webscraping project
@hckim1991 · Created November 5, 2020
# ===== Scrapy spider (carvana/spiders/) =====
from scrapy import Spider, Request
from carvana.items import CarvanaItem
import re
from math import ceil


class CarvanaSpider(Spider):
    name = "carvana_spider"
    allowed_urls = ['https://www.carvana.com/']
    # start_urls based on an average down payment of $2,500 and a monthly payment of less than $500/month
    start_urls = ['https://www.carvana.com/cars/filters/?cvnaid=eyJwcmljZSI6eyJtaW4iOjIyMTQ1LCJtYXgiOjI5NTI3fX0=']
    # Could potentially expand this to multiple links that segregate by body type
    # (see the filter-building sketch after this file)

    def parse(self, response):
        # Generally 20 items per page but not always
        page_number = ceil(int(re.findall(r'\d+', response.xpath('//span[@data-qa="pagination-text"]/text()').extract()[3])[0]) / 20)
        # range() excludes its upper bound, so go to page_number + 1 to cover the final page
        urls = [f'https://www.carvana.com/cars/filters/?cvnaid=eyJwcmljZSI6eyJtaW4iOjIyMTQ1LCJtYXgiOjI5NTI3fX0=&page={x}'
                for x in range(1, page_number + 1)]
        for url in urls:
            yield Request(url = url, callback = self.parse_product_page)

    def parse_product_page(self, response):
        products = response.xpath('//section[@data-qa="results-section"]/div[@data-qa="result-tile"]')
        for product in products:
            try:
                year = int(product.xpath('.//h3[@data-qa="result-tile-make"]/text()').extract_first().split()[0])
            except Exception:
                year = None
                print('='*50)
                print(f'No year. Offending url is {response.url}')
                print('='*50)

            try:
                brand = product.xpath('.//h3[@data-qa="result-tile-make"]/text()').extract_first().split()[1]
            except Exception:
                brand = None
                print('='*50)
                print(f'No brand. Offending url is {response.url}')
                print('='*50)

            try:
                model = product.xpath('.//h3[@data-qa="result-tile-model"]/text()').extract_first()
            except Exception:
                model = None
                print('='*50)
                print(f'No model. Offending url is {response.url}')
                print('='*50)

            try:
                trim = product.xpath('.//h4[@data-qa="vehicle-trim"]/text()').extract_first()
            except Exception:
                trim = None
                print('='*50)
                print(f'No trim. Offending url is {response.url}')
                print('='*50)

            try:
                miles = int(product.xpath('.//h4[@data-qa="vehicle-mileage"]/text()').extract_first().split()[0].replace(',', ''))
            except Exception:
                miles = None
                print('='*50)
                print(f'No miles. Offending url is {response.url}')
                print('='*50)

            try:
                price = int(product.xpath('.//span[@property="price"]/text()').extract_first().replace(',', ''))
            except Exception:
                price = None
                print('='*50)
                print(f'No price. Offending url is {response.url}')
                print('='*50)

            try:
                monthly_pmt = int(re.findall(r'\d+', product.xpath('.//span[@data-qa="monthly-payment"]/text()').extract_first().split()[1])[0])
            except Exception:
                monthly_pmt = None
                print('='*50)
                print(f'No monthly payment. Offending url is {response.url}')
                print('='*50)

            try:
                shipping = product.xpath('.//div[@data-qa="shipping-cost"]/text()').extract_first()
            except Exception:
                shipping = None
                print('='*50)
                print(f'No shipping info. Offending url is {response.url}')
                print('='*50)

            item = CarvanaItem()
            item['year'] = year
            item['brand'] = brand
            item['model'] = model
            item['trim'] = trim
            item['miles'] = miles
            item['price'] = price
            item['monthly_pmt'] = monthly_pmt
            item['shipping'] = shipping
            yield item
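A note on the hard-coded filter URL: the cvnaid query parameter is just base64-encoded JSON (the one above decodes to {"price":{"min":22145,"max":29527}}), so the body-type expansion mentioned in the spider could generate filter URLs instead of hard-coding them. A minimal sketch, assuming Carvana keeps accepting this encoding; make_filter_url is an illustrative helper, not part of the project:

import base64
import json

def make_filter_url(min_price, max_price):
    # Hypothetical helper: encode a price-range filter the way the site's URL does
    filters = {'price': {'min': min_price, 'max': max_price}}
    encoded = base64.b64encode(json.dumps(filters, separators=(',', ':')).encode()).decode()
    return f'https://www.carvana.com/cars/filters/?cvnaid={encoded}'

# make_filter_url(22145, 29527) reproduces the start URL used above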
# ===== Analysis notebook (Jupyter, converted to .py) =====
#!/usr/bin/env python
# coding: utf-8
# # Creating the Data Frame for Analyses
# In[ ]:
import numpy as np
import pandas as pd
from scipy import stats
import re
import datetime
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns
get_ipython().run_line_magic('matplotlib', 'inline')
# In[ ]:
def extract_shipping(x):
    '''Extract the cost of shipping from the shipping column of webscraped data.
    Free shipping = 0; otherwise the dollar cost of shipping.'''
    if x == 'Free Shipping':
        return 0
    else:
        return int(re.findall(r'\d+', x)[0])
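# For reference, the two input shapes extract_shipping handles; the paid-shipping
# string below is an illustrative guess based on the regex, not a captured value:
# extract_shipping('Free Shipping')   -> 0
# extract_shipping('$190 Shipping')   -> 190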
# In[ ]:
#Read the csv file
carvana_raw = pd.read_csv('carvana.csv')
#Remove rows with no model name
carvana_raw = carvana_raw[~carvana_raw['model'].isna()]
#Create a new column called ticker with single value of CVNA for merging with VRM data later
carvana_raw['ticker'] = 'CVNA'
#Reorder columns and rename year with age
carvana_raw = carvana_raw[['ticker', 'year', 'brand', 'model', 'trim', 'miles', 'price', 'monthly_pmt', 'shipping']]
carvana_raw.columns = carvana_raw.columns.str.replace('year', 'age')
#Apply extract_shipping function to extract shipping cost
carvana_raw['shipping'] = carvana_raw['shipping'].apply(extract_shipping)
#Convert year to age (plain int instead of np.int, which is deprecated in newer NumPy)
carvana_raw['age'] = carvana_raw['age'].astype(int)
carvana_raw['age'] = abs(carvana_raw['age'] - datetime.datetime.now().year)
#Drop duplicate rows and keep only the 7 columns shared with the Vroom data
carvana_clean = carvana_raw[~carvana_raw.duplicated()].iloc[:, :7]
#Change Alfa to Alfa Romeo, Land to Land Rover, Mercedes-Ben to Mercedes-Benz
carvana_clean = carvana_clean.replace(['Alfa', 'Land', 'Mercedes-Ben'], ['Alfa Romeo', 'Land Rover', 'Mercedes-Benz'])
carvana_clean.sample(5)
# In[ ]:
#Read the csv file
vroom_raw = pd.read_csv('./vroom/vroom.csv', header = None)
#Name columns
vroom_raw.columns = ['age', 'brand', 'model', 'trim', 'miles', 'price']
#Remove rows with no model name
vroom_raw = vroom_raw[~vroom_raw['model'].isna()]
#Create a new column called ticker with single value of VRM for merging with CVNA data later
vroom_raw['ticker'] = 'VRM'
#Reorder columns
vroom_raw = vroom_raw[['ticker', 'age', 'brand', 'model', 'trim', 'miles', 'price']]
#Convert year to age (plain int instead of the deprecated np.int)
vroom_raw['age'] = vroom_raw['age'].astype(int)
vroom_raw['age'] = abs(vroom_raw['age'] - datetime.datetime.now().year)
#Clean up price
vroom_raw['price'] = vroom_raw['price'].apply(lambda s: int(s[1:].replace(',', '')))
#Drop duplicate rows
vroom_clean = vroom_raw[~vroom_raw.duplicated()]
vroom_clean.head(5)
# In[ ]:
#Concatenate the two data frames row-wise
clean_df = pd.concat([carvana_clean, vroom_clean])
clean_df.sample(5)
# # Data Visualization
# In[ ]:
#Default settings
sns.set(font_scale=1.25, style = 'dark')
# In[ ]:
#Clean up dataframes and set variables
clean_df_age = clean_df.groupby(['ticker', 'age']).count()['model']
CVNA_inv_35 = sum((clean_df_age['CVNA']/sum(clean_df_age['CVNA']))[3:6])
VRM_inv_35 = sum((clean_df_age['VRM']/sum(clean_df_age['VRM']))[3:6])
CVNA_mean_age = clean_df[clean_df['ticker'] == 'CVNA']['age'].mean()
VRM_mean_age = clean_df[clean_df['ticker'] == 'VRM']['age'].mean()
#Visualization
age_plot = sns.FacetGrid(clean_df, col = 'ticker', height = 6)
age_plot.map(plt.hist, 'age', density = True)
age_plot.set_axis_labels(x_var = 'Age', y_var = 'Count Density')
age_plot.set_titles('{col_name}')
age_plot.axes[0][0].text(6, 0.35, f'Mean age: {CVNA_mean_age:0.1f}\n % 3-5 years old: {CVNA_inv_35:.1%}', size = 15)
age_plot.axes[0][1].text(6, 0.35, f'Mean age: {VRM_mean_age:0.1f}\n % 3-5 years old: {VRM_inv_35:.1%}', size = 15)
plt.subplots_adjust(top=0.8)
age_plot.fig.suptitle('Age Distribution', size = 25);
#t-test
stats.ttest_ind(clean_df[clean_df['ticker'] == 'CVNA']['age'], clean_df[clean_df['ticker'] == 'VRM']['age'])
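# ttest_ind returns a (statistic, pvalue) pair; a small sketch of reporting it
# (the 5% threshold is an illustrative convention, not from the original):
t_stat, p_value = stats.ttest_ind(clean_df[clean_df['ticker'] == 'CVNA']['age'],
                                  clean_df[clean_df['ticker'] == 'VRM']['age'])
print(f't = {t_stat:.2f}, p = {p_value:.3g}')
if p_value < 0.05:
    print('The difference in mean age is significant at the 5% level.')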
# In[ ]:
#Clean up dataframes and set variables
clean_df_brand = clean_df.groupby(['ticker', 'brand']).count().sort_values('age')[['age']].reset_index()
def normalize_count(df):
    '''Normalize inventory counts to percentages so that CVNA and VRM can be compared apples to apples'''
    total_CVNA = df.groupby('ticker').sum().loc['CVNA']['age']
    total_VRM = df.groupby('ticker').sum().loc['VRM']['age']
    df.loc[df['ticker'] == 'CVNA', 'age'] = df.loc[df['ticker'] == 'CVNA', 'age'] / total_CVNA
    df.loc[df['ticker'] == 'VRM', 'age'] = df.loc[df['ticker'] == 'VRM', 'age'] / total_VRM
    return df

top_10_brands = ['Toyota', 'Chevrolet', 'FIAT', 'Honda', 'Kia', 'Ford', 'Mitsubishi', 'Nissan', 'Subaru', 'Mazda']
clean_df_brand['color'] = [sns.color_palette('tab10')[1] if brand in top_10_brands
                           else sns.color_palette('tab10')[0] for brand in clean_df_brand['brand']]
clean_df_brand = normalize_count(clean_df_brand)
CVNA_top_10_brands = clean_df_brand.groupby(['ticker', 'color']).sum()['age']['CVNA'][1]
VRM_top_10_brands = clean_df_brand.groupby(['ticker', 'color']).sum()['age']['VRM'][1]
#Visualization
fig = plt.figure(figsize = (12, 8))
axes0 = plt.subplot2grid((1, 9), (0, 0), colspan = 4)
axes1 = plt.subplot2grid((1, 9), (0, 5), colspan = 4)
plt.subplots_adjust(wspace = 0.75)
axes0.barh(clean_df_brand[clean_df_brand['ticker'] == 'CVNA']['brand'],
           clean_df_brand[clean_df_brand['ticker'] == 'CVNA']['age'],
           color = clean_df_brand[clean_df_brand['ticker'] == 'CVNA']['color'])
axes1.barh(clean_df_brand[clean_df_brand['ticker'] == 'VRM']['brand'],
           clean_df_brand[clean_df_brand['ticker'] == 'VRM']['age'],
           color = clean_df_brand[clean_df_brand['ticker'] == 'VRM']['color'])
axes0.set_xlabel('Percentage')
axes0.set_ylabel('Brand')
axes0.xaxis.set_major_formatter(mtick.PercentFormatter(1))
axes0.text(0.02, 'Alfa Romeo', f'Top 10 Brand As % of Inventory: {CVNA_top_10_brands:0.1%}', size = 12)
axes0.set_title('CVNA')
axes1.set_xlabel('Percentage')
axes1.xaxis.set_major_formatter(mtick.PercentFormatter(1))
axes1.text(0.025, 'Land Rover', f'Top 10 Brand As % of Inventory: {VRM_top_10_brands:0.1%}', size = 12)
axes1.set_title('VRM')
fig.suptitle('Brand Distribution (Orange = Top 10 Brands)', size = 25);
# In[ ]:
#Clean up dataframes and set variables
top_10_models = ['Q7', 'Malibu', 'Malibu Limited', 'Pacifica', 'F150 Super Cab', 'F150 Regular Cab',
                 'F150 SuperCrew Ca', 'F-150', 'Accord Hybrid', 'Sonata', 'Rogue', 'Rogue Sport', 'Rogue Select',
                 'Highlander', 'GTI', 'Golf GTI']
clean_df_models = clean_df[clean_df['model'].isin(top_10_models)]
def model_cleanup(df):
    '''Replace repetitive model names with a single clean name'''
    df = df.replace('Malibu Limited', 'Malibu')
    df = df.replace(['F150 Super Cab', 'F150 Regular Cab', 'F150 SuperCrew Ca'], 'F-150')
    df = df.replace(['Rogue Sport', 'Rogue Select'], 'Rogue')
    df = df.replace('Golf GTI', 'GTI')
    return df
clean_df_models = model_cleanup(clean_df_models)
clean_df_models = clean_df_models.groupby(['ticker', 'model']).count().sort_values('age')[['age']].reset_index()
total_CVNA = clean_df_models.groupby('ticker').sum()['age']['CVNA']
total_CVNA_pct = total_CVNA / clean_df.groupby('ticker').count()['age']['CVNA']
total_VRM = clean_df_models.groupby('ticker').sum()['age']['VRM']
total_VRM_pct = total_VRM / clean_df.groupby('ticker').count()['age']['VRM']
#Visualization
fig = plt.figure(figsize = (12, 8))
axes0 = plt.subplot2grid((1, 9), (0, 0), colspan = 4)
axes1 = plt.subplot2grid((1, 9), (0, 5), colspan = 4)
plt.subplots_adjust(wspace = 0.75)
axes0.barh(clean_df_models[clean_df_models['ticker'] == 'CVNA']['model'],
           clean_df_models[clean_df_models['ticker'] == 'CVNA']['age'])
axes1.barh(clean_df_models[clean_df_models['ticker'] == 'VRM']['model'],
           clean_df_models[clean_df_models['ticker'] == 'VRM']['age'])
axes0.set_xlabel('Count')
axes0.set_ylabel('Model')
axes0.set_title('CVNA')
axes0.text(40, 'Sonata', f'Top 10 Model Count: {total_CVNA} \n % of Inventory: {total_CVNA_pct:.2%}', size = 13)
axes1.set_xlabel('Count')
axes1.set_title('VRM')
axes1.text(50, 'Rogue', f'Top 10 Model Count: {total_VRM} \n % of Inventory: {total_VRM_pct:.2%}', size = 13)
fig.suptitle('10 Best Used Models', size = 25);
# In[ ]:
#Clean up dataframes and set variables
clean_df['miles/age'] = clean_df['miles'] / clean_df['age']
#Current-model-year cars have age 0; drop the resulting division artifacts
clean_df.loc[clean_df['age'] == 0, 'miles/age'] = np.nan
CVNA_miles_per_age = clean_df.groupby('ticker').mean()['miles/age']['CVNA']
VRM_miles_per_age = clean_df.groupby('ticker').mean()['miles/age']['VRM']
#Visualization
fig = plt.figure(figsize = (12, 8))
axes0 = plt.subplot2grid((1, 9), (0, 0), colspan = 4)
axes1 = plt.subplot2grid((1, 9), (0, 5), colspan = 4)
plt.subplots_adjust(wspace = 0.75)
axes0.hist(clean_df[clean_df['ticker'] == 'CVNA']['miles'], density = True)
axes1.hist(clean_df[clean_df['ticker'] == 'VRM']['miles'], density = True)
axes0.set_xlabel('Miles')
axes0.set_ylabel('Miles Density')
axes0.set_title('CVNA')
axes0.text(70000, 1.5e-5, f'Miles per age: \n {CVNA_miles_per_age:,.0f}', size = 13)
axes1.set_xlabel('Miles')
axes1.set_title('VRM')
axes1.text(70000, 2e-5, f'Miles per age: \n {VRM_miles_per_age:,.0f}', size = 13)
fig.suptitle('Miles Distribution', size = 25);
#t-test
stats.ttest_ind(clean_df[clean_df['ticker'] == 'CVNA']['miles'], clean_df[clean_df['ticker'] == 'VRM']['miles'])
# In[ ]:
#Clean up dataframes and set variables
clean_df_price = clean_df.groupby(['ticker', 'brand']).mean()[['price']].reset_index()
clean_df_price_merged = pd.merge(clean_df_price[clean_df_price['ticker'] == 'CVNA'],
                                 clean_df_price[clean_df_price['ticker'] == 'VRM'], on = 'brand')
clean_df_price_merged['price_diff'] = clean_df_price_merged['price_x'] - clean_df_price_merged['price_y']
clean_df_price_merged = clean_df_price_merged[['brand', 'price_diff']].sort_values('price_diff', ascending = False)
avg_price_diff = clean_df_price_merged['price_diff'].mean()
#Visualization
fig = plt.figure(figsize = (12, 8))
ax = fig.add_axes([0, 0, 0.8, 0.8])
ax.barh(clean_df_price_merged['brand'], clean_df_price_merged['price_diff'])
ax.xaxis.set_major_formatter(mtick.StrMethodFormatter('${x:,.0f}'))
ax.text(250, 'Audi', f'CVNA cars are ${-avg_price_diff:.0f} cheaper\nthan VRM cars on average.')
ax.set_title('Price Difference for Each Brand (CVNA Less VRM)', size = 25);
#t-test
stats.ttest_1samp(clean_df_price_merged['price_diff'], 0)
# In[ ]:
#Clean up dataframes and set variables
retail_units = pd.read_csv('retail_units.csv', index_col = 0)
retail_units = retail_units.T.reset_index()
retail_units.columns = retail_units.columns.str.replace('index', 'quarter')
retail_units_CVNA = retail_units.iloc[:, 0:2]
retail_units_CVNA.columns = retail_units_CVNA.columns.str.replace('CVNA', 'retail_units')
retail_units_CVNA['Ticker'] = 'CVNA'
retail_units_VRM = retail_units.iloc[:, [0, 2]]
retail_units_VRM.columns = retail_units_VRM.columns.str.replace('VRM', 'retail_units')
retail_units_VRM['Ticker'] = 'VRM'
retail_units = pd.concat([retail_units_CVNA, retail_units_VRM])
#Visualization
plt.figure(figsize = (12, 8))
plt.xticks(rotation = -30)
retail = sns.lineplot(x = 'quarter', y = 'retail_units', hue = 'Ticker', data = retail_units, linewidth = 5)
retail.set(xlabel = 'Quarter', ylabel = 'Retail Units')
retail.text('16Q2', 50000, f'Both CVNA and VRM started in 2013!\nData not available for prior periods.')
plt.title('Trajectory of Retail Sales', size = 25);
# In[ ]:
#Clean up dataframes and set variables
retail_units2 = retail_units[retail_units['quarter'] >= '19Q1']
CVNA_20Q3 = retail_units2[(retail_units2['quarter'] == '20Q3') &
                          (retail_units2['Ticker'] == 'CVNA')]['retail_units'].values[0]
CVNA_19Q1 = retail_units2[(retail_units2['quarter'] == '19Q1') &
                          (retail_units2['Ticker'] == 'CVNA')]['retail_units'].values[0]
VRM_20Q3 = retail_units2[(retail_units2['quarter'] == '20Q3') &
                         (retail_units2['Ticker'] == 'VRM')]['retail_units'].values[0]
VRM_19Q1 = retail_units2[(retail_units2['quarter'] == '19Q1') &
                         (retail_units2['Ticker'] == 'VRM')]['retail_units'].values[0]
#Visualization
plt.figure(figsize = (12, 8))
plt.xticks(rotation = -30)
retail2 = sns.lineplot(x = 'quarter', y = 'retail_units', hue = 'Ticker', data = retail_units2, linewidth = 5)
retail2.set(xlabel = 'Quarter', ylabel = 'Retail Units')
retail2.text('19Q2', 30000, f'CVNA grew {CVNA_20Q3 - CVNA_19Q1:,.0f} units ({(CVNA_20Q3/CVNA_19Q1)-1:.1%}) from 19Q1 to 20Q3. \nVRM grew {VRM_20Q3 - VRM_19Q1:,.0f} units ({(VRM_20Q3/VRM_19Q1)-1:.1%}) from 19Q1 to 20Q3.')
plt.title('Trajectory of Retail Sales', size = 25);
# ===== carvana/items.py =====
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy


class CarvanaItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    year = scrapy.Field()
    brand = scrapy.Field()
    model = scrapy.Field()
    trim = scrapy.Field()
    miles = scrapy.Field()
    price = scrapy.Field()
    monthly_pmt = scrapy.Field()
    shipping = scrapy.Field()
# ===== carvana/middlewares.py =====
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class CarvanaSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class CarvanaDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
# ===== carvana/pipelines.py =====
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exporters import CsvItemExporter


class WriteItemPipeline(object):
    def __init__(self):
        self.filename = 'carvana.csv'

    def open_spider(self, spider):
        self.csvfile = open(self.filename, 'wb')
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.csvfile.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
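# Aside: on Scrapy 2.1+ the same CSV export can be had without a custom pipeline,
# using the built-in feed exports -- a minimal sketch of the settings.py alternative:
# FEEDS = {'carvana.csv': {'format': 'csv'}}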
# ===== carvana/settings.py =====
# Scrapy settings for carvana project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'carvana'
SPIDER_MODULES = ['carvana.spiders']
NEWSPIDER_MODULE = 'carvana.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 2
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'carvana.middlewares.CarvanaSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'carvana.middlewares.CarvanaDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {'carvana.pipelines.WriteItemPipeline': 300}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# ===== Selenium scraper for Vroom (writes vroom.csv) =====
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv
import re
import time

driver = webdriver.Chrome(r'C:\Users\hk486\chromedriver.exe')
driver.maximize_window()
# Same price filter as the Carvana crawl, base64-encoded in the URL
driver.get("https://www.vroom.com/cars/?filters=eyJzZWFyY2giOiIiLCJwcmljZSI6eyJtaW4iOjIyMTQ1LCJtYXgiOjI5NTI3fX0=")

csv_file = open('vroom.csv', 'w', encoding='utf-8', newline='')
writer = csv.writer(csv_file)
page = 1

while True:
    try:
        print('='*50)
        print(f'Scraping page {page}')
        wait_product = WebDriverWait(driver, 7.5)
        products = wait_product.until(EC.presence_of_all_elements_located((By.XPATH, '//div[@class="MuiGrid-root MuiGrid-item MuiGrid-grid-xs-12 MuiGrid-grid-sm-6 MuiGrid-grid-md-3"]')))
        for product in products:
            product_dict = {}
            driver.execute_script("arguments[0].scrollIntoView();", product)
            year = product.find_element_by_xpath('./div[1]/a[1]/div[2]/p[1]').text.split()[0]
            print(f'year: {year}')
            brand = product.find_element_by_xpath('./div[1]/a[1]/div[2]/p[1]').text.split()[1]
            # Two-word brands ('Alfa Romeo', 'Land Rover') shift the model tokens by one
            if brand in ['Alfa', 'Land']:
                brand = ' '.join(product.find_element_by_xpath('./div[1]/a[1]/div[2]/p[1]').text.split()[1:3])
                print(f'brand: {brand}')
                model = ' '.join(product.find_element_by_xpath('./div[1]/a[1]/div[2]/p[1]').text.split()[3:])
                print(f'model: {model}')
            else:
                print(f'brand: {brand}')
                model = ' '.join(product.find_element_by_xpath('./div[1]/a[1]/div[2]/p[1]').text.split()[2:])
                print(f'model: {model}')
            trim = product.find_element_by_xpath('./div[1]/a[1]/div[2]/div[1]/p[1]').text
            print(f'trim: {trim}')
            miles = int(product.find_element_by_xpath('./div[1]/a[1]/div[2]/div[1]/p[3]').text.split()[0].replace(',', ''))
            print(f'miles: {miles}')
            price = product.find_element_by_xpath('./div[1]/a[1]/div[2]/p[2]').text
            print(f'price: {price}')
            print('='*50)
            product_dict['year'] = year
            product_dict['brand'] = brand
            product_dict['model'] = model
            product_dict['trim'] = trim
            product_dict['miles'] = miles
            product_dict['price'] = price
            writer.writerow(product_dict.values())
        wait_next_button = WebDriverWait(driver, 7.5)
        next_button = wait_next_button.until(EC.element_to_be_clickable((By.XPATH, '//button[@aria-label="Go to next page"]')))
        next_button.click()
        time.sleep(1)
        print('='*50)
        print('Next button clicked')
        page += 1
    except Exception as e:
        # A timeout or missing "next" button means the last page has been scraped
        print('='*10)
        print(e)
        csv_file.close()
        driver.close()
        break
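# Aside: this script targets Selenium 3. Selenium 4 removed the find_element_by_*
# helpers and the positional driver path; a minimal sketch of the equivalent calls
# (the chromedriver path is illustrative):
#
# from selenium.webdriver.chrome.service import Service
# driver = webdriver.Chrome(service=Service(r'C:\path\to\chromedriver.exe'))
# year_text = product.find_element(By.XPATH, './div[1]/a[1]/div[2]/p[1]').text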