from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import pandas as pd

def getListingLinks(link):
    # Open the driver
    driver = webdriver.Chrome(executable_path="/Users/erikgregorywebb/Downloads/chromedriver 2")
    driver.get(link)

    # Save the links
    listing_links = []
    links = driver.find_elements_by_css_selector('.listing-item-link')
    for link in links:
        listing_links.append(str(link.get_attribute('href')))
    driver.close()
    return listing_links

def getListingContent(listing_link):
    # Open the driver
    driver = webdriver.Chrome(executable_path="/Users/erikgregorywebb/Downloads/chromedriver 2")
    driver.get(listing_link)

    # Collect listing information
    try:
        title = driver.find_element_by_css_selector('.listingDetails-title')
        location = driver.find_element_by_css_selector('.listingDetails-location')
        price = driver.find_element_by_css_selector('.listingDetails-price')
        views = driver.find_element_by_css_selector('.viewsDesktop-viewsNumber')
        favorites = driver.find_element_by_css_selector('.viewsDesktop-favoritedNumber')
        description = driver.find_element_by_css_selector('.listingDescription-text')
        name = driver.find_element_by_css_selector('.listingContactSeller-firstName-value')

        # Compile into a list
        listing = [title.text, location.text, price.text, views.text, favorites.text,
                   description.text, name.text, listing_link]
        driver.close()
        return listing
    except Exception:
        print("An error occurred.")
        driver.close()

def getListings(url):
    links = getListingLinks(url)
    listings = []

    # Loop over the first 10 listing links
    for i in range(0, 10):
        time.sleep(3)
        try:
            listing = getListingContent(links[i])
            if listing is not None:  # skip listings that failed to scrape
                listings.append(listing)
        except Exception:
            print("An error occurred:", links[i])

    # Create DataFrame, clean variables
    df = pd.DataFrame(listings, columns=['title', 'location', 'price', 'views', 'favorites', 'description', 'name', 'link'])
    return df

def cleanListings(df):
    # Split the location variable into location and days_online
    df[['location', 'days_online']] = df['location'].str.split('|', n=1, expand=True)

    # Remove the dollar sign and commas in price (literal replacements, not regex)
    df['price'] = df['price'].str.replace('$', '', regex=False)
    df['price'] = df['price'].str.replace(',', '', regex=False)

    # Convert from string to numeric
    df['views'] = pd.to_numeric(df['views'])
    df['favorites'] = pd.to_numeric(df['favorites'])
    return df

def main(url):
    start_time = time.time()

    # Process
    raw_df = getListings(url)
    df = cleanListings(raw_df)

    # Export
    df.to_csv("/Users/erikgregorywebb/Documents/Python/ksl-scrapper/listings.csv", sep=',')
    print("--- %s seconds ---" % round(time.time() - start_time, 2))
    return df
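
# A minimal usage sketch: the search URL below is a hypothetical placeholder, not one
# confirmed by the gist -- substitute the KSL classifieds results page you want to scrape.
if __name__ == '__main__':
    url = 'https://www.ksl.com/classifieds/search?keyword=bicycle'  # hypothetical example URL
    listings_df = main(url)
    print(listings_df.head())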