-
-
Save hckim1991/f1f11539d022f4699f1c579a3fea7071 to your computer and use it in GitHub Desktop.
Webscraping project
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from scrapy import Spider, Request | |
from carvana.items import CarvanaItem | |
import re | |
from math import ceil | |
class CarvanaSpider(Spider):
    """Crawl Carvana search results and yield one CarvanaItem per listing.

    The start URL embeds a base64-encoded price filter of roughly
    $22,145-$29,527, derived from an assumed $2,500 down payment and a
    sub-$500/month budget.  Could potentially be expanded to multiple
    start URLs segregated by body type.
    """
    name = "carvana_spider"
    allowed_urls = ['https://www.carvana.com/']
    # start_urls based on average downpayment of $2,500 and monthly payment of less than $500/month
    start_urls = ['https://www.carvana.com/cars/filters/?cvnaid=eyJwcmljZSI6eyJtaW4iOjIyMTQ1LCJtYXgiOjI5NTI3fX0=']

    def parse(self, response):
        """Read the total result count and fan out one request per result page."""
        # The 4th number in the pagination text is the total result count.
        # Generally 20 items per page but not always.
        total_results = int(
            re.findall(r'\d+', response.xpath('//span[@data-qa="pagination-text"]/text()').extract()[3])[0]
        )
        page_number = ceil(total_results / 20)
        # range(1, page_number + 1): the original used range(1, page_number),
        # which silently skipped the final result page (off-by-one).
        urls = [
            f'https://www.carvana.com/cars/filters/?cvnaid=eyJwcmljZSI6eyJtaW4iOjIyMTQ1LCJtYXgiOjI5NTI3fX0=&page={x}'
            for x in range(1, page_number + 1)
        ]
        for url in urls:
            yield Request(url=url, callback=self.parse_product_page)

    def _extract(self, label, response, getter):
        """Run *getter* and return its value, or None when the tile lacks the field.

        Catches only the exception types a missing/malformed tile produces
        (instead of a bare ``except`` that would also hide programming errors)
        and logs the offending page URL for debugging.
        """
        try:
            return getter()
        except (AttributeError, IndexError, ValueError, TypeError):
            print('=' * 50)
            # Original prints were missing the f prefix, so the literal text
            # '{response.url}' was printed instead of the actual URL.
            print(f'No {label}. Offending url is {response.url}')
            print('=' * 50)
            return None

    def parse_product_page(self, response):
        """Parse every result tile on one search page into a CarvanaItem."""
        products = response.xpath('//section[@data-qa="results-section"]/div[@data-qa="result-tile"]')
        for product in products:
            item = CarvanaItem()
            # The tile header reads "<year> <make>", e.g. "2018 Honda".
            item['year'] = self._extract(
                'year', response,
                lambda: int(product.xpath('.//h3[@data-qa="result-tile-make"]/text()').extract_first().split()[0]))
            item['brand'] = self._extract(
                'brand', response,
                lambda: product.xpath('.//h3[@data-qa="result-tile-make"]/text()').extract_first().split()[1])
            item['model'] = self._extract(
                'model', response,
                lambda: product.xpath('.//h3[@data-qa="result-tile-model"]/text()').extract_first())
            item['trim'] = self._extract(
                'trim', response,
                lambda: product.xpath('.//h4[@data-qa="vehicle-trim"]/text()').extract_first())
            item['miles'] = self._extract(
                'miles', response,
                lambda: int(product.xpath('.//h4[@data-qa="vehicle-mileage"]/text()').extract_first().split()[0].replace(',', '')))
            item['price'] = self._extract(
                'price', response,
                lambda: int(product.xpath('.//span[@property="price"]/text()').extract_first().replace(',', '')))
            item['monthly_pmt'] = self._extract(
                'monthly payment', response,
                lambda: int(re.findall(r'\d+', product.xpath('.//span[@data-qa="monthly-payment"]/text()').extract_first().split()[1])[0]))
            item['shipping'] = self._extract(
                'shipping info', response,
                lambda: product.xpath('.//div[@data-qa="shipping-cost"]/text()').extract_first())
            yield item
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#This is a Jupyter notebook converted to py
#!/usr/bin/env python
# coding: utf-8

# # Creating the Data Frame for Analyses

# In[ ]:

import numpy as np
import pandas as pd
from scipy import stats
import re
import datetime
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns
# Jupyter line magic: render matplotlib figures inline.  Only works when run
# inside IPython/Jupyter, where get_ipython() is defined; a plain
# `python` invocation of this file would raise NameError here.
get_ipython().run_line_magic('matplotlib', 'inline')

# In[ ]:
def extract_shipping(x):
    """Return the dollar cost of shipping parsed from a scraped shipping string.

    'Free Shipping' maps to 0; otherwise the first run of digits in the
    string is returned as an int (e.g. '$290 shipping' -> 290).
    Raises IndexError if the string contains no digits at all.
    """
    if x == 'Free Shipping':
        return 0
    # Raw string for the regex: '\d' in a plain literal relies on Python not
    # treating it as an escape, which is a DeprecationWarning in modern Python.
    return int(re.findall(r'\d+', x)[0])
# In[ ]:

# Read the csv file produced by the Carvana scraper
carvana_raw = pd.read_csv('carvana.csv')
# Remove rows with no model name
carvana_raw = carvana_raw[~carvana_raw['model'].isna()]
# Create a new column called ticker with single value of CVNA for merging with VRM data later
carvana_raw['ticker'] = 'CVNA'
# Reorder columns and rename year with age (the conversion happens below)
carvana_raw = carvana_raw[['ticker', 'year', 'brand', 'model', 'trim', 'miles', 'price', 'monthly_pmt', 'shipping']]
carvana_raw.columns = carvana_raw.columns.str.replace('year', 'age')
# Apply extract_shipping function to extract shipping cost in dollars
carvana_raw['shipping'] = carvana_raw['shipping'].apply(extract_shipping)
# Convert model year to age in years.  The builtin `int` replaces `np.int`,
# which was deprecated in NumPy 1.20 and removed in 1.24.
carvana_raw['age'] = carvana_raw['age'].astype(int)
carvana_raw['age'] = abs(carvana_raw['age'] - datetime.datetime.now().year)
# Drop duplicate rows; iloc[:, :7] also drops the monthly_pmt and shipping
# columns so the frame lines up with the Vroom data for concatenation.
carvana_clean = carvana_raw[~carvana_raw.duplicated()].iloc[:, :7]
# Change Alfa to Alfa Romeo, Land to Land Rover, Mercedes-Ben to Mercedes-Benz
carvana_clean = carvana_clean.replace(['Alfa', 'Land', 'Mercedes-Ben'], ['Alfa Romeo', 'Land Rover', 'Mercedes-Benz'])
carvana_clean.sample(5)
# In[ ]:

# Read the csv file produced by the Selenium Vroom scraper (no header row)
vroom_raw = pd.read_csv('./vroom/vroom.csv', header = None)
# Name columns
vroom_raw.columns = ['age', 'brand', 'model', 'trim', 'miles', 'price']
# Remove rows with no model name
vroom_raw = vroom_raw[~vroom_raw['model'].isna()]
# Create a new column called ticker with single value of VRM for merging with CVNA data later
vroom_raw['ticker'] = 'VRM'
# Reorder columns
vroom_raw = vroom_raw[['ticker', 'age', 'brand', 'model', 'trim', 'miles', 'price']]
# Convert model year to age in years.  The builtin `int` replaces `np.int`,
# which was deprecated in NumPy 1.20 and removed in 1.24.
vroom_raw['age'] = vroom_raw['age'].astype(int)
vroom_raw['age'] = abs(vroom_raw['age'] - datetime.datetime.now().year)
# Clean up price: strip the leading '$' and thousands separators, e.g. '$24,990' -> 24990
vroom_raw['price'] = vroom_raw['price'].apply(lambda s: int(s[1:].replace(',', '')))
# Drop duplicate rows
vroom_clean = vroom_raw[~vroom_raw.duplicated()]
vroom_clean.head(5)
# In[ ]:

# Stack the cleaned CVNA and VRM listings into one frame for all
# of the comparisons below
clean_df = pd.concat([carvana_clean, vroom_clean])
clean_df.sample(5)

# # Data Visualization

# In[ ]:

# Default seaborn settings applied to every plot below
sns.set(font_scale=1.25, style = 'dark')
# In[ ]:

# Clean up dataframes and set variables
# Listing counts per (ticker, age); the 'model' column is just the count.
clean_df_age = clean_df.groupby(['ticker', 'age']).count()['model']
# Share of inventory that is 3-5 years old.  NOTE(review): the [3:6] slice is
# positional and assumes ages 3, 4 and 5 occupy those slots of the
# age-indexed counts -- verify every age from 0 up is present.
CVNA_inv_35 = sum((clean_df_age['CVNA']/sum(clean_df_age['CVNA']))[3:6])
VRM_inv_35 = sum((clean_df_age['VRM']/sum(clean_df_age['VRM']))[3:6])
CVNA_mean_age = clean_df[clean_df['ticker'] == 'CVNA']['age'].mean()
VRM_mean_age = clean_df[clean_df['ticker'] == 'VRM']['age'].mean()
# Visualization: one age histogram facet per ticker, annotated with the stats
age_plot = sns.FacetGrid(clean_df, col = 'ticker', height = 6)
age_plot.map(plt.hist, 'age', density = True)
age_plot.set_axis_labels(x_var = 'Age', y_var = 'Count Density')
age_plot.set_titles('{col_name}')
age_plot.axes[0][0].text(6, 0.35, f'Mean age: {CVNA_mean_age:0.1f}\n % 3-5 years old: {CVNA_inv_35:.1%}', size = 15)
age_plot.axes[0][1].text(6, 0.35, f'Mean age: {VRM_mean_age:0.1f}\n % 3-5 years old: {VRM_inv_35:.1%}', size = 15)
plt.subplots_adjust(top=0.8)
age_plot.fig.suptitle('Age Distribution', size = 25);
# t-test: is the difference in mean age between CVNA and VRM significant?
stats.ttest_ind(clean_df[clean_df['ticker'] == 'CVNA']['age'], clean_df[clean_df['ticker'] == 'VRM']['age'])
# In[ ]:

# Clean up dataframes and set variables
# Listing counts per (ticker, brand); the column is still named 'age' only
# because it is the leftover count column from groupby().count().
clean_df_brand = clean_df.groupby(['ticker', 'brand']).count().sort_values('age')[['age']].reset_index()
def normalize_count(df):
    '''Normalize frequency dictionary to % so that CVNA and VRM can be compared apples to apples'''
    # NOTE(review): totals come from the module-level clean_df_brand rather
    # than the df parameter -- equivalent here because the same frame is
    # passed in, but confirm before reusing this function elsewhere.
    total_CVNA = clean_df_brand.groupby('ticker').sum().loc['CVNA']['age']
    total_VRM = clean_df_brand.groupby('ticker').sum().loc['VRM']['age']
    df.loc[df['ticker'] == 'CVNA', 'age'] = df.loc[df['ticker'] == 'CVNA', 'age'] / total_CVNA
    df.loc[df['ticker'] == 'VRM', 'age'] = df.loc[df['ticker'] == 'VRM', 'age'] / total_VRM
    return df
top_10_brands = ['Toyota', 'Chevrolet', 'FIAT', 'Honda', 'Kia', 'Ford', 'Mitsubishi', 'Nissan', 'Subaru', 'Mazda']
# Orange (tab10 palette index 1) for top-10 brands, blue (index 0) otherwise
clean_df_brand['color'] = [sns.color_palette('tab10')[1] if brand in top_10_brands
                           else sns.color_palette('tab10')[0] for brand in clean_df_brand['brand']]
clean_df_brand = normalize_count(clean_df_brand)
# Total top-10 share per ticker.  NOTE(review): the [1] index assumes the
# orange color tuple sorts into the second group slot -- confirm.
CVNA_top_10_brands = clean_df_brand.groupby(['ticker', 'color']).sum()['age']['CVNA'][1]
VRM_top_10_brands = clean_df_brand.groupby(['ticker', 'color']).sum()['age']['VRM'][1]
# Visualization: side-by-side horizontal bar charts, one per ticker
fig = plt.figure(figsize = (12, 8))
axes0 = plt.subplot2grid((1, 9), (0, 0), colspan = 4)
axes1 = plt.subplot2grid((1, 9), (0, 5), colspan = 4)
plt.subplots_adjust(wspace = 0.75)
axes0.barh(clean_df_brand[clean_df_brand['ticker'] == 'CVNA']['brand'],
           clean_df_brand[clean_df_brand['ticker'] == 'CVNA']['age'],
           color = clean_df_brand[clean_df_brand['ticker'] == 'CVNA']['color'])
axes1.barh(clean_df_brand[clean_df_brand['ticker'] == 'VRM']['brand'],
           clean_df_brand[clean_df_brand['ticker'] == 'VRM']['age'],
           color = clean_df_brand[clean_df_brand['ticker'] == 'VRM']['color'])
axes0.set_xlabel('Percentage')
axes0.set_ylabel('Brand')
axes0.xaxis.set_major_formatter(mtick.PercentFormatter(1))
# Categorical y-coordinate: the annotation sits at the 'Alfa Romeo' bar
axes0.text(0.02, 'Alfa Romeo', f'Top 10 Brand As % of Inventory: {CVNA_top_10_brands:0.1%}', size = 12)
axes0.set_title('CVNA')
axes1.set_xlabel('Percentage')
axes1.xaxis.set_major_formatter(mtick.PercentFormatter(1))
axes1.text(0.025, 'Land Rover', f'Top 10 Brand As % of Inventory: {VRM_top_10_brands:0.1%}', size = 12)
axes1.set_title('VRM')
fig.suptitle('Brand Distribution (Orange = Top 10 Brands)', size = 25);
# In[ ]:

# Clean up dataframes and set variables
top_10_models = ['Q7', 'Malibu', 'Malibu Limited', 'Pacifica', 'F150 Super Cab', 'F150 Regular Cab',
                 'F150 SuperCrew Ca', 'F-150', 'Accord Hybrid', 'Sonata', 'Rogue', 'Rogue Sport', 'Rogue Select',
                 'Highlander', 'GTI', 'Golf GTI']
# Series.isin is the idiomatic (and vectorized) equivalent of the original
# apply(lambda x: True if x in top_10_models else False)
clean_df_models = clean_df[clean_df['model'].isin(top_10_models)]
def model_cleanup(df):
    '''Replace repetitive names with the clean name'''
    df = df.replace('Malibu Limited', 'Malibu')
    df = df.replace(['F150 Super Cab', 'F150 Regular Cab', 'F150 SuperCrew Ca'], 'F-150')
    df = df.replace(['Rogue Sport', 'Rogue Select'], 'Rogue')
    df = df.replace('Golf GTI', 'GTI')
    return df
clean_df_models = model_cleanup(clean_df_models)
# Listing counts per (ticker, model); 'age' is the leftover count column
clean_df_models = clean_df_models.groupby(['ticker', 'model']).count().sort_values('age')[['age']].reset_index()
total_CVNA = clean_df_models.groupby('ticker').sum()['age']['CVNA']
total_CVNA_pct = total_CVNA / clean_df.groupby('ticker').count()['age']['CVNA']
total_VRM = clean_df_models.groupby('ticker').sum()['age']['VRM']
total_VRM_pct = total_VRM / clean_df.groupby('ticker').count()['age']['VRM']
# Visualization: side-by-side horizontal bar charts, one per ticker
fig = plt.figure(figsize = (12, 8))
axes0 = plt.subplot2grid((1, 9), (0, 0), colspan = 4)
axes1 = plt.subplot2grid((1, 9), (0, 5), colspan = 4)
plt.subplots_adjust(wspace = 0.75)
axes0.barh(clean_df_models[clean_df_models['ticker'] == 'CVNA']['model'],
           clean_df_models[clean_df_models['ticker'] == 'CVNA']['age'])
axes1.barh(clean_df_models[clean_df_models['ticker'] == 'VRM']['model'],
           clean_df_models[clean_df_models['ticker'] == 'VRM']['age'])
axes0.set_xlabel('Count')
axes0.set_ylabel('Model')
axes0.set_title('CVNA')
# Categorical y-coordinate: the annotation sits at the 'Sonata' bar
axes0.text(40, 'Sonata', f'Top 10 Model Count: {total_CVNA} \n % of Inventory: {total_CVNA_pct:.2%}', size = 13)
axes1.set_xlabel('Count')
axes1.set_title('VRM')
axes1.text(50, 'Rogue', f'Top 10 Model Count: {total_VRM} \n % of Inventory: {total_VRM_pct:.2%}', size = 13)
fig.suptitle('10 Best Used Models', size = 25);
# In[ ]:

# Clean up dataframes and set variables
# NOTE: this adds a 'miles/age' column to clean_df itself (in-place mutation
# of the shared frame).
clean_df['miles/age'] = clean_df['miles'] / clean_df['age']
# Age 0 would give a division-by-zero ratio; mask those rows out of the mean
clean_df.loc[clean_df['age'] == 0, 'miles/age'] = np.nan
CVNA_miles_per_age = clean_df.groupby('ticker').mean()['miles/age']['CVNA']
VRM_miles_per_age = clean_df.groupby('ticker').mean()['miles/age']['VRM']
# Visualization: side-by-side mileage histograms, one per ticker
fig = plt.figure(figsize = (12, 8))
axes0 = plt.subplot2grid((1, 9), (0, 0), colspan = 4)
axes1 = plt.subplot2grid((1, 9), (0, 5), colspan = 4)
plt.subplots_adjust(wspace = 0.75)
axes0.hist(clean_df[clean_df['ticker'] == 'CVNA']['miles'], density = True)
axes1.hist(clean_df[clean_df['ticker'] == 'VRM']['miles'], density = True)
axes0.set_xlabel('Miles')
axes0.set_ylabel('Miles Density')
axes0.set_title('CVNA')
axes0.text(70000, 1.5e-5, f'Miles per age: \n {CVNA_miles_per_age:,.0f}', size = 13)
axes1.set_xlabel('Miles')
axes1.set_title('VRM')
axes1.text(70000, 2e-5, f'Miles per age: \n {VRM_miles_per_age:,.0f}', size = 13)
fig.suptitle('Miles Distribution', size = 25);
# t-test: is the difference in mileage between CVNA and VRM significant?
stats.ttest_ind(clean_df[clean_df['ticker'] == 'CVNA']['miles'], clean_df[clean_df['ticker'] == 'VRM']['miles'])
# In[ ]:

# Clean up dataframes and set variables
# Mean price per (ticker, brand)
clean_df_price = clean_df.groupby(['ticker', 'brand']).mean()[['price']].reset_index()
# Join CVNA and VRM rows on brand: price_x = CVNA mean, price_y = VRM mean
clean_df_price_merged = pd.merge(clean_df_price[clean_df_price['ticker'] == 'CVNA'],
                                 clean_df_price[clean_df_price['ticker'] == 'VRM'], on = 'brand')
# Positive price_diff = CVNA more expensive than VRM for that brand
clean_df_price_merged['price_diff'] = clean_df_price_merged['price_x'] - clean_df_price_merged['price_y']
clean_df_price_merged = clean_df_price_merged[['brand', 'price_diff']].sort_values('price_diff', ascending = False)
avg_price_diff = clean_df_price_merged['price_diff'].mean()
# Visualization: one horizontal bar per brand, x axis formatted as dollars
fig = plt.figure(figsize = (12, 8))
ax = fig.add_axes([0, 0, 0.8, 0.8])
ax.barh(clean_df_price_merged['brand'], clean_df_price_merged['price_diff'])
ax.xaxis.set_major_formatter(mtick.StrMethodFormatter('${x:,.0f}'))
# Categorical y-coordinate: the annotation sits at the 'Audi' bar
ax.text(250, 'Audi', f'CVNA cars are ${-avg_price_diff:.0f} cheaper\nthan VRM cars on average.')
ax.set_title('Price Difference for Each Brand (CVNA Less VRM)', size = 25);
# One-sample t-test: is the mean per-brand price difference distinct from 0?
stats.ttest_1samp(clean_df_price_merged['price_diff'], 0)
# In[ ]:

# Clean up dataframes and set variables
# Quarterly retail unit sales per company; transpose so each row is a quarter
retail_units = pd.read_csv('retail_units.csv', index_col = 0)
retail_units = retail_units.T.reset_index()
retail_units.columns = retail_units.columns.str.replace('index', 'quarter')
# .copy() so the new 'Ticker' column is written to an independent frame
# rather than a view of retail_units (the original triggered pandas'
# SettingWithCopyWarning, where the assignment may silently not stick).
retail_units_CVNA = retail_units.iloc[:, 0:2].copy()
retail_units_CVNA.columns = retail_units_CVNA.columns.str.replace('CVNA', 'retail_units')
retail_units_CVNA['Ticker'] = 'CVNA'
retail_units_VRM = retail_units.iloc[:, [0, 2]].copy()
retail_units_VRM.columns = retail_units_VRM.columns.str.replace('VRM', 'retail_units')
retail_units_VRM['Ticker'] = 'VRM'
# Long format: one (quarter, retail_units, Ticker) row per company-quarter
retail_units = pd.concat([retail_units_CVNA, retail_units_VRM])
# Visualization: one line per ticker over all quarters
plt.figure(figsize = (12, 8))
plt.xticks(rotation = -30)
retail = sns.lineplot(x = 'quarter', y = 'retail_units', hue = 'Ticker', data = retail_units, linewidth = 5)
retail.set(xlabel = 'Quarter', ylabel = 'Retail Units')
retail.text('16Q2', 50000, f'Both CVNA and VRM started in 2013!\nData not available for prior periods.')
plt.title('Trajectory of Retail Sales', size = 25);
# In[ ]:

# Clean up dataframes and set variables
# Restrict to 19Q1 onward; the string comparison works because the 'YYQN'
# labels sort lexicographically within the same century.
retail_units2 = retail_units[retail_units['quarter'] >= '19Q1']
# Endpoint unit counts used for the growth annotation below
CVNA_20Q3 = retail_units2[(retail_units2['quarter'] == '20Q3') &
                          (retail_units2['Ticker'] == 'CVNA')]['retail_units'].values[0]
CVNA_19Q1 = retail_units2[(retail_units2['quarter'] == '19Q1') &
                          (retail_units2['Ticker'] == 'CVNA')]['retail_units'].values[0]
VRM_20Q3 = retail_units2[(retail_units2['quarter'] == '20Q3') &
                         (retail_units2['Ticker'] == 'VRM')]['retail_units'].values[0]
VRM_19Q1 = retail_units2[(retail_units2['quarter'] == '19Q1') &
                         (retail_units2['Ticker'] == 'VRM')]['retail_units'].values[0]
# Visualization: zoomed-in line chart annotated with absolute and % growth
plt.figure(figsize = (12, 8))
plt.xticks(rotation = -30)
retail2 = sns.lineplot(x = 'quarter', y = 'retail_units', hue = 'Ticker', data = retail_units2, linewidth = 5)
retail2.set(xlabel = 'Quarter', ylabel = 'Retail Units')
retail2.text('19Q2', 30000, f'CVNA grew {CVNA_20Q3 - CVNA_19Q1:,.0f} units ({(CVNA_20Q3/CVNA_19Q1)-1:.1%}) from 19Q1 to 20Q3. \nVRM grew {VRM_20Q3 - VRM_19Q1:,.0f} units ({(VRM_20Q3/VRM_19Q1)-1:.1%}) from 19Q1 to 20Q3.')
plt.title('Trajectory of Retail Sales', size = 25);
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Define here the models for your scraped items | |
# | |
# See documentation in: | |
# https://docs.scrapy.org/en/latest/topics/items.html | |
import scrapy | |
class CarvanaItem(scrapy.Item):
    """Container for one Carvana listing scraped by carvana_spider.

    Each field is populated in parse_product_page and may be None when the
    corresponding result tile is missing that piece of data.
    """
    # define the fields for your item here like:
    # name = scrapy.Field()
    year = scrapy.Field()         # model year parsed from the tile header
    brand = scrapy.Field()        # make, e.g. 'Honda'
    model = scrapy.Field()        # model name
    trim = scrapy.Field()         # trim level text
    miles = scrapy.Field()        # odometer reading (int, commas stripped)
    price = scrapy.Field()        # listed price (int, commas stripped)
    monthly_pmt = scrapy.Field()  # estimated monthly payment (int)
    shipping = scrapy.Field()     # raw shipping text, e.g. 'Free Shipping'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Define here the models for your spider middleware | |
# | |
# See documentation in: | |
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html | |
from scrapy import signals | |
# useful for handling different item types with a single interface | |
from itemadapter import is_item, ItemAdapter | |
class CarvanaSpiderMiddleware:
    """Pass-through spider middleware (unmodified Scrapy project template).

    Every hook either forwards its input unchanged or returns None, so the
    crawl behaves exactly as if this middleware were not installed.  Not all
    methods need to be defined; Scrapy treats a missing method as a no-op.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Instantiate the middleware and subscribe it to spider_opened."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        """Accept every response headed into the spider (None = continue)."""
        return None

    def process_spider_output(self, response, result, spider):
        """Forward everything the spider produced, untouched."""
        yield from result

    def process_spider_exception(self, response, exception, spider):
        """Do nothing; let Scrapy and other middleware handle the exception."""
        pass

    def process_start_requests(self, start_requests, spider):
        """Forward the start requests exactly as given (requests only, no items)."""
        yield from start_requests

    def spider_opened(self, spider):
        """Log the spider name once the crawl starts."""
        spider.logger.info('Spider opened: %s' % spider.name)
class CarvanaDownloaderMiddleware:
    """Pass-through downloader middleware (unmodified Scrapy project template).

    Requests, responses and exceptions all flow through unchanged, so the
    crawl behaves exactly as if this middleware were not installed.  Not all
    methods need to be defined; Scrapy treats a missing method as a no-op.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Instantiate the middleware and subscribe it to spider_opened."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_request(self, request, spider):
        # None tells Scrapy to keep processing this request normally
        # (other options: return a Response/Request, or raise IgnoreRequest).
        return None

    def process_response(self, request, response, spider):
        # Hand every downloaded response back unchanged.
        return response

    def process_exception(self, request, exception, spider):
        # None continues the normal exception-processing chain.
        pass

    def spider_opened(self, spider):
        """Log the spider name once the crawl starts."""
        spider.logger.info('Spider opened: %s' % spider.name)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Define your item pipelines here | |
# | |
# Don't forget to add your pipeline to the ITEM_PIPELINES setting | |
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html | |
# useful for handling different item types with a single interface | |
from scrapy.exporters import CsvItemExporter | |
class WriteItemPipeline(object):
    """Export every scraped item as one row of carvana.csv.

    Scrapy instantiates the pipeline once per crawl and drives the
    open_spider / process_item / close_spider lifecycle.
    """
    def __init__(self):
        # Fixed output path, written in the working directory of the crawl.
        self.filename = 'carvana.csv'
    def open_spider(self, spider):
        """Open the output file and start the CSV exporter when the crawl begins."""
        # CsvItemExporter expects a binary file handle.
        self.csvfile = open(self.filename, 'wb')
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.start_exporting()
    def close_spider(self, spider):
        """Flush the exporter and release the file handle when the crawl ends."""
        self.exporter.finish_exporting()
        self.csvfile.close()
    def process_item(self, item, spider):
        """Write one item as a CSV row, then pass it down the pipeline."""
        self.exporter.export_item(item)
        return item
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scrapy settings for carvana project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'carvana'

SPIDER_MODULES = ['carvana.spiders']
NEWSPIDER_MODULE = 'carvana.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent.
# NOTE: this spoofs a desktop Chrome browser rather than identifying the bot.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'

# Obey robots.txt rules -- deliberately disabled for this crawl.
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# 2-second throttle between requests to the same site.
DOWNLOAD_DELAY = 2
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'carvana.middlewares.CarvanaSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'carvana.middlewares.CarvanaDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# Route every scraped item through the CSV-writing pipeline.
ITEM_PIPELINES = {'carvana.pipelines.WriteItemPipeline': 300}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from selenium import webdriver | |
from selenium.webdriver.common.by import By | |
from selenium.webdriver.support.ui import WebDriverWait | |
from selenium.webdriver.support import expected_conditions as EC | |
import csv | |
import re | |
import time | |
# Scrape Vroom listings page by page into vroom.csv.  The loop clicks the
# "next page" button until it is no longer clickable; the resulting timeout
# (or any other error) lands in the except branch, which closes everything.
driver = webdriver.Chrome(r'C:\Users\hk486\chromedriver.exe')
driver.maximize_window()
# Vroom search pre-filtered (base64 JSON blob in the URL) to the same
# price band used for the Carvana crawl.
driver.get("https://www.vroom.com/cars/?filters=eyJzZWFyY2giOiIiLCJwcmljZSI6eyJtaW4iOjIyMTQ1LCJtYXgiOjI5NTI3fX0=")
csv_file = open('vroom.csv', 'w', encoding='utf-8', newline='')
writer = csv.writer(csv_file)
page = 1
while True:
    try:
        print('='*50)
        # 'Scarping' typo fixed in the progress message
        print(f'Scraping page {page}')
        wait_product = WebDriverWait(driver, 7.5)
        products = wait_product.until(EC.presence_of_all_elements_located((By.XPATH, '//div[@class="MuiGrid-root MuiGrid-item MuiGrid-grid-xs-12 MuiGrid-grid-sm-6 MuiGrid-grid-md-3"]')))
        for product in products:
            product_dict = {}
            # Lazy-loaded tiles only render their text once scrolled into view
            driver.execute_script("arguments[0].scrollIntoView();", product)
            # Tile title reads "<year> <brand...> <model...>"; query the DOM
            # once instead of re-fetching the same element per field.
            title_words = product.find_element_by_xpath('./div[1]/a[1]/div[2]/p[1]').text.split()
            year = title_words[0]
            print(f'year: {year}')
            brand = title_words[1]
            if brand in ['Alfa', 'Land']:
                # Two-word makes ('Alfa Romeo', 'Land Rover') shift the model
                brand = ' '.join(title_words[1:3])
                print(f'brand: {brand}')
                model = ' '.join(title_words[3:])
                print(f'model: {model}')
            else:
                print(f'brand: {brand}')
                model = ' '.join(title_words[2:])
                print(f'model: {model}')
            trim = product.find_element_by_xpath('./div[1]/a[1]/div[2]/div[1]/p[1]').text
            print(f'trim: {trim}')
            miles = int(product.find_element_by_xpath('./div[1]/a[1]/div[2]/div[1]/p[3]').text.split()[0].replace(',', ''))
            print(f'miles: {miles}')
            price = product.find_element_by_xpath('./div[1]/a[1]/div[2]/p[2]').text
            print(f'price: {price}')
            print('='*50)
            product_dict['year'] = year
            product_dict['brand'] = brand
            product_dict['model'] = model
            product_dict['trim'] = trim
            product_dict['miles'] = miles
            product_dict['price'] = price
            # Values only -- the CSV is written headerless, matching the
            # header=None read in the analysis notebook.
            writer.writerow(product_dict.values())
        wait_next_button = WebDriverWait(driver, 7.5)
        next_button = wait_next_button.until(EC.element_to_be_clickable((By.XPATH, '//button[@aria-label="Go to next page"]')))
        next_button.click()
        time.sleep(1)
        print('='*50)
        print('Next button clicked')
        page += 1
    except Exception as e:
        # Reaching the last page (no clickable next button) raises a timeout
        # that lands here; clean up and stop.  Any other error also ends the
        # run -- the message below shows which it was.
        print('='*10)
        print(e)
        csv_file.close()
        driver.close()
        break
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment