-
-
Save hckim1991/f1f11539d022f4699f1c579a3fea7071 to your computer and use it in GitHub Desktop.
Webscraping project
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from scrapy import Spider, Request | |
from carvana.items import CarvanaItem | |
import re | |
from math import ceil | |
class CarvanaSpider(Spider):
    """Crawl Carvana search results and yield one CarvanaItem per listing.

    The start URL embeds a base64-encoded price filter of roughly
    $22,145-$29,527, derived from an assumed $2,500 down payment and a
    sub-$500/month budget.  Could potentially be expanded to multiple
    start URLs segregated by body type.
    """
    name = "carvana_spider"
    allowed_urls = ['https://www.carvana.com/']
    # start_urls based on average downpayment of $2,500 and monthly payment of less than $500/month
    start_urls = ['https://www.carvana.com/cars/filters/?cvnaid=eyJwcmljZSI6eyJtaW4iOjIyMTQ1LCJtYXgiOjI5NTI3fX0=']

    def parse(self, response):
        """Read the total result count and fan out one request per result page."""
        # The 4th number in the pagination text is the total result count.
        # Generally 20 items per page but not always.
        total_results = int(
            re.findall(r'\d+', response.xpath('//span[@data-qa="pagination-text"]/text()').extract()[3])[0]
        )
        page_number = ceil(total_results / 20)
        # range(1, page_number + 1): the original used range(1, page_number),
        # which silently skipped the final result page (off-by-one).
        urls = [
            f'https://www.carvana.com/cars/filters/?cvnaid=eyJwcmljZSI6eyJtaW4iOjIyMTQ1LCJtYXgiOjI5NTI3fX0=&page={x}'
            for x in range(1, page_number + 1)
        ]
        for url in urls:
            yield Request(url=url, callback=self.parse_product_page)

    def _extract(self, label, response, getter):
        """Run *getter* and return its value, or None when the tile lacks the field.

        Catches only the exception types a missing/malformed tile produces
        (instead of a bare ``except`` that would also hide programming errors)
        and logs the offending page URL for debugging.
        """
        try:
            return getter()
        except (AttributeError, IndexError, ValueError, TypeError):
            print('=' * 50)
            # Original prints were missing the f prefix, so the literal text
            # '{response.url}' was printed instead of the actual URL.
            print(f'No {label}. Offending url is {response.url}')
            print('=' * 50)
            return None

    def parse_product_page(self, response):
        """Parse every result tile on one search page into a CarvanaItem."""
        products = response.xpath('//section[@data-qa="results-section"]/div[@data-qa="result-tile"]')
        for product in products:
            item = CarvanaItem()
            # The tile header reads "<year> <make>", e.g. "2018 Honda".
            item['year'] = self._extract(
                'year', response,
                lambda: int(product.xpath('.//h3[@data-qa="result-tile-make"]/text()').extract_first().split()[0]))
            item['brand'] = self._extract(
                'brand', response,
                lambda: product.xpath('.//h3[@data-qa="result-tile-make"]/text()').extract_first().split()[1])
            item['model'] = self._extract(
                'model', response,
                lambda: product.xpath('.//h3[@data-qa="result-tile-model"]/text()').extract_first())
            item['trim'] = self._extract(
                'trim', response,
                lambda: product.xpath('.//h4[@data-qa="vehicle-trim"]/text()').extract_first())
            item['miles'] = self._extract(
                'miles', response,
                lambda: int(product.xpath('.//h4[@data-qa="vehicle-mileage"]/text()').extract_first().split()[0].replace(',', '')))
            item['price'] = self._extract(
                'price', response,
                lambda: int(product.xpath('.//span[@property="price"]/text()').extract_first().replace(',', '')))
            item['monthly_pmt'] = self._extract(
                'monthly payment', response,
                lambda: int(re.findall(r'\d+', product.xpath('.//span[@data-qa="monthly-payment"]/text()').extract_first().split()[1])[0]))
            item['shipping'] = self._extract(
                'shipping info', response,
                lambda: product.xpath('.//div[@data-qa="shipping-cost"]/text()').extract_first())
            yield item
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#This is a Jupyter notebook converted to py
#!/usr/bin/env python
# coding: utf-8

# # Creating the Data Frame for Analyses

# In[ ]:

import numpy as np
import pandas as pd
from scipy import stats
import re
import datetime
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns
# Jupyter line magic: render matplotlib figures inline.  Only works when run
# inside IPython/Jupyter, where get_ipython() is defined; a plain
# `python` invocation of this file would raise NameError here.
get_ipython().run_line_magic('matplotlib', 'inline')

# In[ ]:
def extract_shipping(x):
    """Return the dollar cost of shipping parsed from a scraped shipping string.

    'Free Shipping' maps to 0; otherwise the first run of digits in the
    string is returned as an int (e.g. '$290 shipping' -> 290).
    Raises IndexError if the string contains no digits at all.
    """
    if x == 'Free Shipping':
        return 0
    # Raw string for the regex: '\d' in a plain literal relies on Python not
    # treating it as an escape, which is a DeprecationWarning in modern Python.
    return int(re.findall(r'\d+', x)[0])
# In[ ]:

# Read the csv file produced by the Carvana scraper
carvana_raw = pd.read_csv('carvana.csv')
# Remove rows with no model name
carvana_raw = carvana_raw[~carvana_raw['model'].isna()]
# Create a new column called ticker with single value of CVNA for merging with VRM data later
carvana_raw['ticker'] = 'CVNA'
# Reorder columns and rename year with age (the conversion happens below)
carvana_raw = carvana_raw[['ticker', 'year', 'brand', 'model', 'trim', 'miles', 'price', 'monthly_pmt', 'shipping']]
carvana_raw.columns = carvana_raw.columns.str.replace('year', 'age')
# Apply extract_shipping function to extract shipping cost in dollars
carvana_raw['shipping'] = carvana_raw['shipping'].apply(extract_shipping)
# Convert model year to age in years.  The builtin `int` replaces `np.int`,
# which was deprecated in NumPy 1.20 and removed in 1.24.
carvana_raw['age'] = carvana_raw['age'].astype(int)
carvana_raw['age'] = abs(carvana_raw['age'] - datetime.datetime.now().year)
# Drop duplicate rows; iloc[:, :7] also drops the monthly_pmt and shipping
# columns so the frame lines up with the Vroom data for concatenation.
carvana_clean = carvana_raw[~carvana_raw.duplicated()].iloc[:, :7]
# Change Alfa to Alfa Romeo, Land to Land Rover, Mercedes-Ben to Mercedes-Benz
carvana_clean = carvana_clean.replace(['Alfa', 'Land', 'Mercedes-Ben'], ['Alfa Romeo', 'Land Rover', 'Mercedes-Benz'])
carvana_clean.sample(5)
# In[ ]:

# Read the csv file produced by the Selenium Vroom scraper (no header row)
vroom_raw = pd.read_csv('./vroom/vroom.csv', header = None)
# Name columns
vroom_raw.columns = ['age', 'brand', 'model', 'trim', 'miles', 'price']
# Remove rows with no model name
vroom_raw = vroom_raw[~vroom_raw['model'].isna()]
# Create a new column called ticker with single value of VRM for merging with CVNA data later
vroom_raw['ticker'] = 'VRM'
# Reorder columns
vroom_raw = vroom_raw[['ticker', 'age', 'brand', 'model', 'trim', 'miles', 'price']]
# Convert model year to age in years.  The builtin `int` replaces `np.int`,
# which was deprecated in NumPy 1.20 and removed in 1.24.
vroom_raw['age'] = vroom_raw['age'].astype(int)
vroom_raw['age'] = abs(vroom_raw['age'] - datetime.datetime.now().year)
# Clean up price: strip the leading '$' and thousands separators, e.g. '$24,990' -> 24990
vroom_raw['price'] = vroom_raw['price'].apply(lambda s: int(s[1:].replace(',', '')))
# Drop duplicate rows
vroom_clean = vroom_raw[~vroom_raw.duplicated()]
vroom_clean.head(5)
# In[ ]:

# Stack the cleaned CVNA and VRM listings into one frame for all
# of the comparisons below
clean_df = pd.concat([carvana_clean, vroom_clean])
clean_df.sample(5)

# # Data Visualization

# In[ ]:

# Default seaborn settings applied to every plot below
sns.set(font_scale=1.25, style = 'dark')
# In[ ]:

# Clean up dataframes and set variables
# Listing counts per (ticker, age); the 'model' column is just the count.
clean_df_age = clean_df.groupby(['ticker', 'age']).count()['model']
# Share of inventory that is 3-5 years old.  NOTE(review): the [3:6] slice is
# positional and assumes ages 3, 4 and 5 occupy those slots of the
# age-indexed counts -- verify every age from 0 up is present.
CVNA_inv_35 = sum((clean_df_age['CVNA']/sum(clean_df_age['CVNA']))[3:6])
VRM_inv_35 = sum((clean_df_age['VRM']/sum(clean_df_age['VRM']))[3:6])
CVNA_mean_age = clean_df[clean_df['ticker'] == 'CVNA']['age'].mean()
VRM_mean_age = clean_df[clean_df['ticker'] == 'VRM']['age'].mean()
# Visualization: one age histogram facet per ticker, annotated with the stats
age_plot = sns.FacetGrid(clean_df, col = 'ticker', height = 6)
age_plot.map(plt.hist, 'age', density = True)
age_plot.set_axis_labels(x_var = 'Age', y_var = 'Count Density')
age_plot.set_titles('{col_name}')
age_plot.axes[0][0].text(6, 0.35, f'Mean age: {CVNA_mean_age:0.1f}\n % 3-5 years old: {CVNA_inv_35:.1%}', size = 15)
age_plot.axes[0][1].text(6, 0.35, f'Mean age: {VRM_mean_age:0.1f}\n % 3-5 years old: {VRM_inv_35:.1%}', size = 15)
plt.subplots_adjust(top=0.8)
age_plot.fig.suptitle('Age Distribution', size = 25);
# t-test: is the difference in mean age between CVNA and VRM significant?
stats.ttest_ind(clean_df[clean_df['ticker'] == 'CVNA']['age'], clean_df[clean_df['ticker'] == 'VRM']['age'])
# In[ ]:

# Clean up dataframes and set variables
# Listing counts per (ticker, brand); the column is still named 'age' only
# because it is the leftover count column from groupby().count().
clean_df_brand = clean_df.groupby(['ticker', 'brand']).count().sort_values('age')[['age']].reset_index()
def normalize_count(df):
    '''Normalize frequency dictionary to % so that CVNA and VRM can be compared apples to apples'''
    # NOTE(review): totals come from the module-level clean_df_brand rather
    # than the df parameter -- equivalent here because the same frame is
    # passed in, but confirm before reusing this function elsewhere.
    total_CVNA = clean_df_brand.groupby('ticker').sum().loc['CVNA']['age']
    total_VRM = clean_df_brand.groupby('ticker').sum().loc['VRM']['age']
    df.loc[df['ticker'] == 'CVNA', 'age'] = df.loc[df['ticker'] == 'CVNA', 'age'] / total_CVNA
    df.loc[df['ticker'] == 'VRM', 'age'] = df.loc[df['ticker'] == 'VRM', 'age'] / total_VRM
    return df
top_10_brands = ['Toyota', 'Chevrolet', 'FIAT', 'Honda', 'Kia', 'Ford', 'Mitsubishi', 'Nissan', 'Subaru', 'Mazda']
# Orange (tab10 palette index 1) for top-10 brands, blue (index 0) otherwise
clean_df_brand['color'] = [sns.color_palette('tab10')[1] if brand in top_10_brands
                           else sns.color_palette('tab10')[0] for brand in clean_df_brand['brand']]
clean_df_brand = normalize_count(clean_df_brand)
# Total top-10 share per ticker.  NOTE(review): the [1] index assumes the
# orange color tuple sorts into the second group slot -- confirm.
CVNA_top_10_brands = clean_df_brand.groupby(['ticker', 'color']).sum()['age']['CVNA'][1]
VRM_top_10_brands = clean_df_brand.groupby(['ticker', 'color']).sum()['age']['VRM'][1]
# Visualization: side-by-side horizontal bar charts, one per ticker
fig = plt.figure(figsize = (12, 8))
axes0 = plt.subplot2grid((1, 9), (0, 0), colspan = 4)
axes1 = plt.subplot2grid((1, 9), (0, 5), colspan = 4)
plt.subplots_adjust(wspace = 0.75)
axes0.barh(clean_df_brand[clean_df_brand['ticker'] == 'CVNA']['brand'],
           clean_df_brand[clean_df_brand['ticker'] == 'CVNA']['age'],
           color = clean_df_brand[clean_df_brand['ticker'] == 'CVNA']['color'])
axes1.barh(clean_df_brand[clean_df_brand['ticker'] == 'VRM']['brand'],
           clean_df_brand[clean_df_brand['ticker'] == 'VRM']['age'],
           color = clean_df_brand[clean_df_brand['ticker'] == 'VRM']['color'])
axes0.set_xlabel('Percentage')
axes0.set_ylabel('Brand')
axes0.xaxis.set_major_formatter(mtick.PercentFormatter(1))
# Categorical y-coordinate: the annotation sits at the 'Alfa Romeo' bar
axes0.text(0.02, 'Alfa Romeo', f'Top 10 Brand As % of Inventory: {CVNA_top_10_brands:0.1%}', size = 12)
axes0.set_title('CVNA')
axes1.set_xlabel('Percentage')
axes1.xaxis.set_major_formatter(mtick.PercentFormatter(1))
axes1.text(0.025, 'Land Rover', f'Top 10 Brand As % of Inventory: {VRM_top_10_brands:0.1%}', size = 12)
axes1.set_title('VRM')
fig.suptitle('Brand Distribution (Orange = Top 10 Brands)', size = 25);
# In[ ]:

# Clean up dataframes and set variables
top_10_models = ['Q7', 'Malibu', 'Malibu Limited', 'Pacifica', 'F150 Super Cab', 'F150 Regular Cab',
                 'F150 SuperCrew Ca', 'F-150', 'Accord Hybrid', 'Sonata', 'Rogue', 'Rogue Sport', 'Rogue Select',
                 'Highlander', 'GTI', 'Golf GTI']
# Series.isin is the idiomatic (and vectorized) equivalent of the original
# apply(lambda x: True if x in top_10_models else False)
clean_df_models = clean_df[clean_df['model'].isin(top_10_models)]
def model_cleanup(df):
    '''Replace repetitive names with the clean name'''
    df = df.replace('Malibu Limited', 'Malibu')
    df = df.replace(['F150 Super Cab', 'F150 Regular Cab', 'F150 SuperCrew Ca'], 'F-150')
    df = df.replace(['Rogue Sport', 'Rogue Select'], 'Rogue')
    df = df.replace('Golf GTI', 'GTI')
    return df
clean_df_models = model_cleanup(clean_df_models)
# Listing counts per (ticker, model); 'age' is the leftover count column
clean_df_models = clean_df_models.groupby(['ticker', 'model']).count().sort_values('age')[['age']].reset_index()
total_CVNA = clean_df_models.groupby('ticker').sum()['age']['CVNA']
total_CVNA_pct = total_CVNA / clean_df.groupby('ticker').count()['age']['CVNA']
total_VRM = clean_df_models.groupby('ticker').sum()['age']['VRM']
total_VRM_pct = total_VRM / clean_df.groupby('ticker').count()['age']['VRM']
# Visualization: side-by-side horizontal bar charts, one per ticker
fig = plt.figure(figsize = (12, 8))
axes0 = plt.subplot2grid((1, 9), (0, 0), colspan = 4)
axes1 = plt.subplot2grid((1, 9), (0, 5), colspan = 4)
plt.subplots_adjust(wspace = 0.75)
axes0.barh(clean_df_models[clean_df_models['ticker'] == 'CVNA']['model'],
           clean_df_models[clean_df_models['ticker'] == 'CVNA']['age'])
axes1.barh(clean_df_models[clean_df_models['ticker'] == 'VRM']['model'],
           clean_df_models[clean_df_models['ticker'] == 'VRM']['age'])
axes0.set_xlabel('Count')
axes0.set_ylabel('Model')
axes0.set_title('CVNA')
# Categorical y-coordinate: the annotation sits at the 'Sonata' bar
axes0.text(40, 'Sonata', f'Top 10 Model Count: {total_CVNA} \n % of Inventory: {total_CVNA_pct:.2%}', size = 13)
axes1.set_xlabel('Count')
axes1.set_title('VRM')
axes1.text(50, 'Rogue', f'Top 10 Model Count: {total_VRM} \n % of Inventory: {total_VRM_pct:.2%}', size = 13)
fig.suptitle('10 Best Used Models', size = 25);
# In[ ]:

# Clean up dataframes and set variables
# NOTE: this adds a 'miles/age' column to clean_df itself (in-place mutation
# of the shared frame).
clean_df['miles/age'] = clean_df['miles'] / clean_df['age']
# Age 0 would give a division-by-zero ratio; mask those rows out of the mean
clean_df.loc[clean_df['age'] == 0, 'miles/age'] = np.nan
CVNA_miles_per_age = clean_df.groupby('ticker').mean()['miles/age']['CVNA']
VRM_miles_per_age = clean_df.groupby('ticker').mean()['miles/age']['VRM']
# Visualization: side-by-side mileage histograms, one per ticker
fig = plt.figure(figsize = (12, 8))
axes0 = plt.subplot2grid((1, 9), (0, 0), colspan = 4)
axes1 = plt.subplot2grid((1, 9), (0, 5), colspan = 4)
plt.subplots_adjust(wspace = 0.75)
axes0.hist(clean_df[clean_df['ticker'] == 'CVNA']['miles'], density = True)
axes1.hist(clean_df[clean_df['ticker'] == 'VRM']['miles'], density = True)
axes0.set_xlabel('Miles')
axes0.set_ylabel('Miles Density')
axes0.set_title('CVNA')
axes0.text(70000, 1.5e-5, f'Miles per age: \n {CVNA_miles_per_age:,.0f}', size = 13)
axes1.set_xlabel('Miles')
axes1.set_title('VRM')
axes1.text(70000, 2e-5, f'Miles per age: \n {VRM_miles_per_age:,.0f}', size = 13)
fig.suptitle('Miles Distribution', size = 25);
# t-test: is the difference in mileage between CVNA and VRM significant?
stats.ttest_ind(clean_df[clean_df['ticker'] == 'CVNA']['miles'], clean_df[clean_df['ticker'] == 'VRM']['miles'])
# In[ ]:

# Clean up dataframes and set variables
# Mean price per (ticker, brand)
clean_df_price = clean_df.groupby(['ticker', 'brand']).mean()[['price']].reset_index()
# Join CVNA and VRM rows on brand: price_x = CVNA mean, price_y = VRM mean
clean_df_price_merged = pd.merge(clean_df_price[clean_df_price['ticker'] == 'CVNA'],
                                 clean_df_price[clean_df_price['ticker'] == 'VRM'], on = 'brand')
# Positive price_diff = CVNA more expensive than VRM for that brand
clean_df_price_merged['price_diff'] = clean_df_price_merged['price_x'] - clean_df_price_merged['price_y']
clean_df_price_merged = clean_df_price_merged[['brand', 'price_diff']].sort_values('price_diff', ascending = False)
avg_price_diff = clean_df_price_merged['price_diff'].mean()
# Visualization: one horizontal bar per brand, x axis formatted as dollars
fig = plt.figure(figsize = (12, 8))
ax = fig.add_axes([0, 0, 0.8, 0.8])
ax.barh(clean_df_price_merged['brand'], clean_df_price_merged['price_diff'])
ax.xaxis.set_major_formatter(mtick.StrMethodFormatter('${x:,.0f}'))
# Categorical y-coordinate: the annotation sits at the 'Audi' bar
ax.text(250, 'Audi', f'CVNA cars are ${-avg_price_diff:.0f} cheaper\nthan VRM cars on average.')
ax.set_title('Price Difference for Each Brand (CVNA Less VRM)', size = 25);
# One-sample t-test: is the mean per-brand price difference distinct from 0?
stats.ttest_1samp(clean_df_price_merged['price_diff'], 0)
# In[ ]:

# Clean up dataframes and set variables
# Quarterly retail unit sales per company; transpose so each row is a quarter
retail_units = pd.read_csv('retail_units.csv', index_col = 0)
retail_units = retail_units.T.reset_index()
retail_units.columns = retail_units.columns.str.replace('index', 'quarter')
# .copy() so the new 'Ticker' column is written to an independent frame
# rather than a view of retail_units (the original triggered pandas'
# SettingWithCopyWarning, where the assignment may silently not stick).
retail_units_CVNA = retail_units.iloc[:, 0:2].copy()
retail_units_CVNA.columns = retail_units_CVNA.columns.str.replace('CVNA', 'retail_units')
retail_units_CVNA['Ticker'] = 'CVNA'
retail_units_VRM = retail_units.iloc[:, [0, 2]].copy()
retail_units_VRM.columns = retail_units_VRM.columns.str.replace('VRM', 'retail_units')
retail_units_VRM['Ticker'] = 'VRM'
# Long format: one (quarter, retail_units, Ticker) row per company-quarter
retail_units = pd.concat([retail_units_CVNA, retail_units_VRM])
# Visualization: one line per ticker over all quarters
plt.figure(figsize = (12, 8))
plt.xticks(rotation = -30)
retail = sns.lineplot(x = 'quarter', y = 'retail_units', hue = 'Ticker', data = retail_units, linewidth = 5)
retail.set(xlabel = 'Quarter', ylabel = 'Retail Units')
retail.text('16Q2', 50000, f'Both CVNA and VRM started in 2013!\nData not available for prior periods.')
plt.title('Trajectory of Retail Sales', size = 25);
# In[ ]:

# Clean up dataframes and set variables
# Restrict to 19Q1 onward; the string comparison works because the 'YYQN'
# labels sort lexicographically within the same century.
retail_units2 = retail_units[retail_units['quarter'] >= '19Q1']
# Endpoint unit counts used for the growth annotation below
CVNA_20Q3 = retail_units2[(retail_units2['quarter'] == '20Q3') &
                          (retail_units2['Ticker'] == 'CVNA')]['retail_units'].values[0]
CVNA_19Q1 = retail_units2[(retail_units2['quarter'] == '19Q1') &
                          (retail_units2['Ticker'] == 'CVNA')]['retail_units'].values[0]
VRM_20Q3 = retail_units2[(retail_units2['quarter'] == '20Q3') &
                         (retail_units2['Ticker'] == 'VRM')]['retail_units'].values[0]
VRM_19Q1 = retail_units2[(retail_units2['quarter'] == '19Q1') &
                         (retail_units2['Ticker'] == 'VRM')]['retail_units'].values[0]
# Visualization: zoomed-in line chart annotated with absolute and % growth
plt.figure(figsize = (12, 8))
plt.xticks(rotation = -30)
retail2 = sns.lineplot(x = 'quarter', y = 'retail_units', hue = 'Ticker', data = retail_units2, linewidth = 5)
retail2.set(xlabel = 'Quarter', ylabel = 'Retail Units')
retail2.text('19Q2', 30000, f'CVNA grew {CVNA_20Q3 - CVNA_19Q1:,.0f} units ({(CVNA_20Q3/CVNA_19Q1)-1:.1%}) from 19Q1 to 20Q3. \nVRM grew {VRM_20Q3 - VRM_19Q1:,.0f} units ({(VRM_20Q3/VRM_19Q1)-1:.1%}) from 19Q1 to 20Q3.')
plt.title('Trajectory of Retail Sales', size = 25);
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Define here the models for your scraped items | |
# | |
# See documentation in: | |
# https://docs.scrapy.org/en/latest/topics/items.html | |
import scrapy | |
class CarvanaItem(scrapy.Item):
    """Container for one Carvana listing scraped by carvana_spider.

    Each field is populated in parse_product_page and may be None when the
    corresponding result tile is missing that piece of data.
    """
    # define the fields for your item here like:
    # name = scrapy.Field()
    year = scrapy.Field()         # model year parsed from the tile header
    brand = scrapy.Field()        # make, e.g. 'Honda'
    model = scrapy.Field()        # model name
    trim = scrapy.Field()         # trim level text
    miles = scrapy.Field()        # odometer reading (int, commas stripped)
    price = scrapy.Field()        # listed price (int, commas stripped)
    monthly_pmt = scrapy.Field()  # estimated monthly payment (int)
    shipping = scrapy.Field()     # raw shipping text, e.g. 'Free Shipping'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Define here the models for your spider middleware | |
# | |
# See documentation in: | |
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html | |
from scrapy import signals | |
# useful for handling different item types with a single interface | |
from itemadapter import is_item, ItemAdapter | |
class CarvanaSpiderMiddleware:
    """Pass-through spider middleware (unmodified Scrapy project template).

    Every hook either forwards its input unchanged or returns None, so the
    crawl behaves exactly as if this middleware were not installed.  Not all
    methods need to be defined; Scrapy treats a missing method as a no-op.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Instantiate the middleware and subscribe it to spider_opened."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        """Accept every response headed into the spider (None = continue)."""
        return None

    def process_spider_output(self, response, result, spider):
        """Forward everything the spider produced, untouched."""
        yield from result

    def process_spider_exception(self, response, exception, spider):
        """Do nothing; let Scrapy and other middleware handle the exception."""
        pass

    def process_start_requests(self, start_requests, spider):
        """Forward the start requests exactly as given (requests only, no items)."""
        yield from start_requests

    def spider_opened(self, spider):
        """Log the spider name once the crawl starts."""
        spider.logger.info('Spider opened: %s' % spider.name)
class CarvanaDownloaderMiddleware:
    """Pass-through downloader middleware (unmodified Scrapy project template).

    Requests, responses and exceptions all flow through unchanged, so the
    crawl behaves exactly as if this middleware were not installed.  Not all
    methods need to be defined; Scrapy treats a missing method as a no-op.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Instantiate the middleware and subscribe it to spider_opened."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_request(self, request, spider):
        # None tells Scrapy to keep processing this request normally
        # (other options: return a Response/Request, or raise IgnoreRequest).
        return None

    def process_response(self, request, response, spider):
        # Hand every downloaded response back unchanged.
        return response

    def process_exception(self, request, exception, spider):
        # None continues the normal exception-processing chain.
        pass

    def spider_opened(self, spider):
        """Log the spider name once the crawl starts."""
        spider.logger.info('Spider opened: %s' % spider.name)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Define your item pipelines here | |
# | |
# Don't forget to add your pipeline to the ITEM_PIPELINES setting | |
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html | |
# useful for handling different item types with a single interface | |
from scrapy.exporters import CsvItemExporter | |
class WriteItemPipeline(object):
    """Export every scraped item as one row of carvana.csv.

    Scrapy instantiates the pipeline once per crawl and drives the
    open_spider / process_item / close_spider lifecycle.
    """
    def __init__(self):
        # Fixed output path, written in the working directory of the crawl.
        self.filename = 'carvana.csv'
    def open_spider(self, spider):
        """Open the output file and start the CSV exporter when the crawl begins."""
        # CsvItemExporter expects a binary file handle.
        self.csvfile = open(self.filename, 'wb')
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.start_exporting()
    def close_spider(self, spider):
        """Flush the exporter and release the file handle when the crawl ends."""
        self.exporter.finish_exporting()
        self.csvfile.close()
    def process_item(self, item, spider):
        """Write one item as a CSV row, then pass it down the pipeline."""
        self.exporter.export_item(item)
        return item
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scrapy settings for carvana project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'carvana'

SPIDER_MODULES = ['carvana.spiders']
NEWSPIDER_MODULE = 'carvana.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent.
# NOTE: this spoofs a desktop Chrome browser rather than identifying the bot.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'

# Obey robots.txt rules -- deliberately disabled for this crawl.
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# 2-second throttle between requests to the same site.
DOWNLOAD_DELAY = 2
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'carvana.middlewares.CarvanaSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'carvana.middlewares.CarvanaDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# Route every scraped item through the CSV-writing pipeline.
ITEM_PIPELINES = {'carvana.pipelines.WriteItemPipeline': 300}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from selenium import webdriver | |
from selenium.webdriver.common.by import By | |
from selenium.webdriver.support.ui import WebDriverWait | |
from selenium.webdriver.support import expected_conditions as EC | |
import csv | |
import re | |
import time | |
# Scrape Vroom listings page by page into vroom.csv.  The loop clicks the
# "next page" button until it is no longer clickable; the resulting timeout
# (or any other error) lands in the except branch, which closes everything.
driver = webdriver.Chrome(r'C:\Users\hk486\chromedriver.exe')
driver.maximize_window()
# Vroom search pre-filtered (base64 JSON blob in the URL) to the same
# price band used for the Carvana crawl.
driver.get("https://www.vroom.com/cars/?filters=eyJzZWFyY2giOiIiLCJwcmljZSI6eyJtaW4iOjIyMTQ1LCJtYXgiOjI5NTI3fX0=")
csv_file = open('vroom.csv', 'w', encoding='utf-8', newline='')
writer = csv.writer(csv_file)
page = 1
while True:
    try:
        print('='*50)
        # 'Scarping' typo fixed in the progress message
        print(f'Scraping page {page}')
        wait_product = WebDriverWait(driver, 7.5)
        products = wait_product.until(EC.presence_of_all_elements_located((By.XPATH, '//div[@class="MuiGrid-root MuiGrid-item MuiGrid-grid-xs-12 MuiGrid-grid-sm-6 MuiGrid-grid-md-3"]')))
        for product in products:
            product_dict = {}
            # Lazy-loaded tiles only render their text once scrolled into view
            driver.execute_script("arguments[0].scrollIntoView();", product)
            # Tile title reads "<year> <brand...> <model...>"; query the DOM
            # once instead of re-fetching the same element per field.
            title_words = product.find_element_by_xpath('./div[1]/a[1]/div[2]/p[1]').text.split()
            year = title_words[0]
            print(f'year: {year}')
            brand = title_words[1]
            if brand in ['Alfa', 'Land']:
                # Two-word makes ('Alfa Romeo', 'Land Rover') shift the model
                brand = ' '.join(title_words[1:3])
                print(f'brand: {brand}')
                model = ' '.join(title_words[3:])
                print(f'model: {model}')
            else:
                print(f'brand: {brand}')
                model = ' '.join(title_words[2:])
                print(f'model: {model}')
            trim = product.find_element_by_xpath('./div[1]/a[1]/div[2]/div[1]/p[1]').text
            print(f'trim: {trim}')
            miles = int(product.find_element_by_xpath('./div[1]/a[1]/div[2]/div[1]/p[3]').text.split()[0].replace(',', ''))
            print(f'miles: {miles}')
            price = product.find_element_by_xpath('./div[1]/a[1]/div[2]/p[2]').text
            print(f'price: {price}')
            print('='*50)
            product_dict['year'] = year
            product_dict['brand'] = brand
            product_dict['model'] = model
            product_dict['trim'] = trim
            product_dict['miles'] = miles
            product_dict['price'] = price
            # Values only -- the CSV is written headerless, matching the
            # header=None read in the analysis notebook.
            writer.writerow(product_dict.values())
        wait_next_button = WebDriverWait(driver, 7.5)
        next_button = wait_next_button.until(EC.element_to_be_clickable((By.XPATH, '//button[@aria-label="Go to next page"]')))
        next_button.click()
        time.sleep(1)
        print('='*50)
        print('Next button clicked')
        page += 1
    except Exception as e:
        # Reaching the last page (no clickable next button) raises a timeout
        # that lands here; clean up and stop.  Any other error also ends the
        # run -- the message below shows which it was.
        print('='*10)
        print(e)
        csv_file.close()
        driver.close()
        break
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment