# erikgregorywebb/ksl-scraper.py

## ksl-scraper.py
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import pandas as pd

def getListingLinks(link):
    """Collect the URLs of all listings shown on a results page.

    Opens a Chrome session, loads ``link`` and reads the ``href`` attribute
    of every element matching the ``.listing-item-link`` CSS selector.

    Args:
        link: URL of the page to load.

    Returns:
        A list of strings, one per matching element, in the order the
        driver returned them. A missing ``href`` becomes the text ``'None'``
        because each value is passed through ``str``.
    """
    # Imported locally so this function carries its own dependency.
    from selenium.webdriver.common.by import By

    # NOTE(review): the chromedriver path is machine-specific, and the
    # ``executable_path`` keyword is deprecated in Selenium 4 -- confirm
    # against the installed Selenium version.
    driver = webdriver.Chrome(executable_path="/Users/erikgregorywebb/Downloads/chromedriver 2")
    try:
        driver.get(link)
        elements = driver.find_elements(By.CSS_SELECTOR, '.listing-item-link')
        return [str(element.get_attribute('href')) for element in elements]
    finally:
        # quit() ends the whole session (browser and driver process), and
        # running it in ``finally`` releases it even when loading fails.
        driver.quit()


def getListingContent(listing_link):
    """Scrape the details of a single listing page.

    Args:
        listing_link: URL of the listing to open.

    Returns:
        A list ``[title, location, price, views, favorites, description,
        name, listing_link]`` holding the text of each page element plus
        the link itself, or ``None`` (after printing a message) when any
        element cannot be read.

    Raises:
        Exception: Whatever ``driver.get`` raises when the page cannot be
            loaded; the browser is still shut down first.
    """
    # Imported locally so this function carries its own dependency.
    from selenium.webdriver.common.by import By

    # Order matters: it defines the positions of the returned values.
    selectors = (
        '.listingDetails-title',
        '.listingDetails-location',
        '.listingDetails-price',
        '.viewsDesktop-viewsNumber',
        '.viewsDesktop-favoritedNumber',
        '.listingDescription-text',
        '.listingContactSeller-firstName-value',
    )

    # NOTE(review): the chromedriver path is machine-specific, and the
    # ``executable_path`` keyword is deprecated in Selenium 4 -- confirm
    # against the installed Selenium version.
    driver = webdriver.Chrome(executable_path="/Users/erikgregorywebb/Downloads/chromedriver 2")
    try:
        # Load errors propagate to the caller, as they always did.
        driver.get(listing_link)

        # Collect listing information
        try:
            listing = [
                driver.find_element(By.CSS_SELECTOR, selector).text
                for selector in selectors
            ]
        except Exception:
            # Best effort: a listing with a missing element is reported
            # and skipped rather than aborting the whole scrape.
            print("An error occured.")
            return None

        listing.append(listing_link)
        return listing
    finally:
        # Single cleanup point; quit() also stops the driver process.
        driver.quit()

def getListings(url, max_listings=10, delay=3):
    """Scrape the first listings of a results page into a DataFrame.

    Args:
        url: URL of the results page passed to ``getListingLinks``.
        max_listings: Maximum number of listing links to visit. Fewer are
            visited when the page offers fewer links.
        delay: Seconds to sleep before each listing is opened.

    Returns:
        A DataFrame with the columns ``title``, ``location``, ``price``,
        ``views``, ``favorites``, ``description``, ``name`` and ``link``,
        one row per listing that was scraped successfully.
    """
    links = getListingLinks(url)
    listings = []

    # Slicing (instead of indexing 0..9) cannot run past the end of a
    # short link list.
    for listing_link in links[:max_listings]:
        time.sleep(delay)
        try:
            listing = getListingContent(listing_link)
        except Exception:
            print("An error occured:", listing_link)
            continue
        # getListingContent returns None on failure; a None row would not
        # fit the eight-column frame built below.
        if listing is not None:
            listings.append(listing)

    # Create DataFrame
    columns = ['title', 'location', 'price', 'views', 'favorites', 'description', 'name', 'link']
    return pd.DataFrame(listings, columns=columns)

def cleanLlistings(df):
    """Tidy the raw scraped listings in place and return the same frame.

    * Splits ``location`` at the first ``|`` into ``location`` (the text
      before it) and a new ``days_online`` column (the text after it,
      missing for rows without a ``|``).
    * Removes ``$`` and ``,`` from ``price``; the column stays text.
    * Converts ``views`` and ``favorites`` from text to numbers.

    Args:
        df: DataFrame with text columns ``location``, ``price``, ``views``
            and ``favorites``.

    Returns:
        The same DataFrame object, modified in place.

    Raises:
        ValueError: From ``pd.to_numeric`` when ``views`` or ``favorites``
            holds text that cannot be parsed as a number.
    """
    # expand=True gives one column per piece; reindex guarantees both
    # pieces exist even when no row contains a '|'.
    parts = df['location'].str.split('|', n=1, expand=True).reindex(columns=[0, 1])
    df['location'] = parts[0]
    df['days_online'] = parts[1]

    # regex=False: '$' must be removed literally, not read as the
    # end-of-string anchor.
    for symbol in ('$', ','):
        df['price'] = df['price'].str.replace(symbol, '', regex=False)

    # Convert from string to numeric
    df['views'] = pd.to_numeric(df['views'])
    df['favorites'] = pd.to_numeric(df['favorites'])

    return df

def main(url, output_path="/Users/erikgregorywebb/Documents/Python/ksl-scrapper/listings.csv"):
    """Scrape a results page, clean the data and export it as CSV.

    Args:
        url: URL of the results page to scrape.
        output_path: Destination of the CSV file. Defaults to the path the
            script has always written to.

    Returns:
        The cleaned DataFrame that was written to ``output_path``.
    """
    # perf_counter is a monotonic clock, so the elapsed time cannot be
    # skewed by system clock adjustments.
    start_time = time.perf_counter()

    # Process
    raw_df = getListings(url)
    df = cleanLlistings(raw_df)

    # Export
    df.to_csv(output_path, sep=',')

    print("--- %s seconds ---" % round(time.perf_counter() - start_time, 2))
    return df
	from selenium import webdriver
	from selenium.webdriver.common.keys import Keys
	import time
	import pandas as pd

	def getListingLinks(link):
	    """Collect the URLs of all listings shown on a results page.

	    Opens a Chrome session, loads ``link`` and reads the ``href`` attribute
	    of every element matching the ``.listing-item-link`` CSS selector.

	    Args:
	        link: URL of the page to load.

	    Returns:
	        A list of strings, one per matching element, in the order the
	        driver returned them. A missing ``href`` becomes the text ``'None'``
	        because each value is passed through ``str``.
	    """
	    # Imported locally so this function carries its own dependency.
	    from selenium.webdriver.common.by import By

	    # NOTE(review): the chromedriver path is machine-specific, and the
	    # ``executable_path`` keyword is deprecated in Selenium 4 -- confirm
	    # against the installed Selenium version.
	    driver = webdriver.Chrome(executable_path="/Users/erikgregorywebb/Downloads/chromedriver 2")
	    try:
	        driver.get(link)
	        elements = driver.find_elements(By.CSS_SELECTOR, '.listing-item-link')
	        return [str(element.get_attribute('href')) for element in elements]
	    finally:
	        # quit() ends the whole session (browser and driver process), and
	        # running it in ``finally`` releases it even when loading fails.
	        driver.quit()


	def getListingContent(listing_link):
	    """Scrape the details of a single listing page.

	    Args:
	        listing_link: URL of the listing to open.

	    Returns:
	        A list ``[title, location, price, views, favorites, description,
	        name, listing_link]`` holding the text of each page element plus
	        the link itself, or ``None`` (after printing a message) when any
	        element cannot be read.

	    Raises:
	        Exception: Whatever ``driver.get`` raises when the page cannot be
	            loaded; the browser is still shut down first.
	    """
	    # Imported locally so this function carries its own dependency.
	    from selenium.webdriver.common.by import By

	    # Order matters: it defines the positions of the returned values.
	    selectors = (
	        '.listingDetails-title',
	        '.listingDetails-location',
	        '.listingDetails-price',
	        '.viewsDesktop-viewsNumber',
	        '.viewsDesktop-favoritedNumber',
	        '.listingDescription-text',
	        '.listingContactSeller-firstName-value',
	    )

	    # NOTE(review): the chromedriver path is machine-specific, and the
	    # ``executable_path`` keyword is deprecated in Selenium 4 -- confirm
	    # against the installed Selenium version.
	    driver = webdriver.Chrome(executable_path="/Users/erikgregorywebb/Downloads/chromedriver 2")
	    try:
	        # Load errors propagate to the caller, as they always did.
	        driver.get(listing_link)

	        # Collect listing information
	        try:
	            listing = [
	                driver.find_element(By.CSS_SELECTOR, selector).text
	                for selector in selectors
	            ]
	        except Exception:
	            # Best effort: a listing with a missing element is reported
	            # and skipped rather than aborting the whole scrape.
	            print("An error occured.")
	            return None

	        listing.append(listing_link)
	        return listing
	    finally:
	        # Single cleanup point; quit() also stops the driver process.
	        driver.quit()

	def getListings(url, max_listings=10, delay=3):
	    """Scrape the first listings of a results page into a DataFrame.

	    Args:
	        url: URL of the results page passed to ``getListingLinks``.
	        max_listings: Maximum number of listing links to visit. Fewer are
	            visited when the page offers fewer links.
	        delay: Seconds to sleep before each listing is opened.

	    Returns:
	        A DataFrame with the columns ``title``, ``location``, ``price``,
	        ``views``, ``favorites``, ``description``, ``name`` and ``link``,
	        one row per listing that was scraped successfully.
	    """
	    links = getListingLinks(url)
	    listings = []

	    # Slicing (instead of indexing 0..9) cannot run past the end of a
	    # short link list.
	    for listing_link in links[:max_listings]:
	        time.sleep(delay)
	        try:
	            listing = getListingContent(listing_link)
	        except Exception:
	            print("An error occured:", listing_link)
	            continue
	        # getListingContent returns None on failure; a None row would not
	        # fit the eight-column frame built below.
	        if listing is not None:
	            listings.append(listing)

	    # Create DataFrame
	    columns = ['title', 'location', 'price', 'views', 'favorites', 'description', 'name', 'link']
	    return pd.DataFrame(listings, columns=columns)

	def cleanLlistings(df):
	    """Tidy the raw scraped listings in place and return the same frame.

	    * Splits ``location`` at the first ``|`` into ``location`` (the text
	      before it) and a new ``days_online`` column (the text after it,
	      missing for rows without a ``|``).
	    * Removes ``$`` and ``,`` from ``price``; the column stays text.
	    * Converts ``views`` and ``favorites`` from text to numbers.

	    Args:
	        df: DataFrame with text columns ``location``, ``price``, ``views``
	            and ``favorites``.

	    Returns:
	        The same DataFrame object, modified in place.

	    Raises:
	        ValueError: From ``pd.to_numeric`` when ``views`` or ``favorites``
	            holds text that cannot be parsed as a number.
	    """
	    # A plain '|' is split literally, so no regex escape is needed.
	    # expand=True gives one column per piece; reindex guarantees both
	    # pieces exist even when no row contains a '|'.
	    parts = df['location'].str.split('|', n=1, expand=True).reindex(columns=[0, 1])
	    df['location'] = parts[0]
	    df['days_online'] = parts[1]

	    # regex=False: '$' must be removed literally, not read as the
	    # end-of-string anchor.
	    for symbol in ('$', ','):
	        df['price'] = df['price'].str.replace(symbol, '', regex=False)

	    # Convert from string to numeric
	    df['views'] = pd.to_numeric(df['views'])
	    df['favorites'] = pd.to_numeric(df['favorites'])

	    return df

	def main(url, output_path="/Users/erikgregorywebb/Documents/Python/ksl-scrapper/listings.csv"):
	    """Scrape a results page, clean the data and export it as CSV.

	    Args:
	        url: URL of the results page to scrape.
	        output_path: Destination of the CSV file. Defaults to the path the
	            script has always written to.

	    Returns:
	        The cleaned DataFrame that was written to ``output_path``.
	    """
	    # perf_counter is a monotonic clock, so the elapsed time cannot be
	    # skewed by system clock adjustments.
	    start_time = time.perf_counter()

	    # Process
	    raw_df = getListings(url)
	    df = cleanLlistings(raw_df)

	    # Export
	    df.to_csv(output_path, sep=',')

	    print("--- %s seconds ---" % round(time.perf_counter() - start_time, 2))
	    return df