Skip to content

Instantly share code, notes, and snippets.

@hbro
Created March 20, 2017 14:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save hbro/deb705eb7c85e5bc30ffe3a0f3d8cbed to your computer and use it in GitHub Desktop.
Save hbro/deb705eb7c85e5bc30ffe3a0f3d8cbed to your computer and use it in GitHub Desktop.
# Scraper for www.toyotacertified.be (a search engine for used Toyota cars).
from selenium import webdriver
from selenium.webdriver.support.ui import Select
import csv
import re
import os.path
import time
import datetime
import smtplib
from email.mime.text import MIMEText
# SETUP
print("Configuring...")

# Location of the CSV file that records every car already reported.
carsfile = "cars.csv"

# E-mail settings: alert recipients and the sender address.
mailto = ["user1@domain.com", "user2@domain.com"]
mailfrom = "scraper@domain.com"

# Matches runs of digits; used to reduce the mileage and price texts to
# their digits only.
reDigits = re.compile(r'\d+')

# Models to search for, selected by visible text in the "ModelID" dropdown.
models = ["Auris", "Auris Hybride"]

# Column order of the CSV file. No header row is ever written, which is why
# the reader is given the field names explicitly.
headers = ["url", "subject", "mileage", "fuelType", "buildYear",
           "carrosserieType", "color", "price", "dealerInfo", "found"]


def _load_known_urls(path):
    """Return the set of car URLs already stored in the CSV file at *path*.

    A missing file yields an empty set.
    """
    if not os.path.isfile(path):
        return set()
    # newline="" is what the csv module documents for files handed to it.
    with open(path, "r", newline="", encoding="utf-8") as knownfile:
        reader = csv.DictReader(knownfile, fieldnames=headers,
                                delimiter=",", quotechar='"')
        return {row["url"] for row in reader}


def _format_car(carData, indent=""):
    """Render a car record as one "Key: value" line per field.

    Only the key is capitalized. Capitalizing the whole line would lowercase
    the rest of it, which corrupts URLs and dealer details.
    """
    return "\n".join(
        "{}{}: {}".format(indent, key.capitalize(), value)
        for key, value in carData.items()
    )


def _extract_car_data(car, carUrl):
    """Collect the CSV fields for one search-result element *car*."""
    return {
        "url": carUrl,
        "subject": " ".join([
            car.find_element_by_class_name("liCarName").text,
            car.find_element_by_xpath(
                "li[@class='liCarType']/div[@class='description']").text,
        ]),
        # Keep only the digits of the mileage and price texts.
        "mileage": "".join(reDigits.findall(
            car.find_element_by_class_name("liMileage").text)),
        "fuelType": car.find_element_by_class_name("liFuelType").text,
        "buildYear": car.find_element_by_xpath(
            "li[@class='liConstructionYear']/label[@class='year']").text,
        "carrosserieType":
            car.find_element_by_class_name("liCarrosserieType").text,
        "color": car.find_element_by_class_name("liColor").text,
        "price": "".join(reDigits.findall(
            car.find_element_by_class_name("liPrice").text)),
        # textContent is read instead of .text here.
        # NOTE(review): presumably because the element is not visible - confirm.
        "dealerInfo": car.find_element_by_class_name("dealerInfo")
                         .get_attribute('textContent').strip(),
        "found": datetime.datetime.now().isoformat(),
    }


def _fill_search_form(browser, model):
    """Open the advanced-search page and fill in the criteria for *model*."""
    browser.get("http://www.toyotacertified.be/advanced-search")
    print("Populating search fields...")
    Select(browser.find_element_by_id("ModelID")).select_by_visible_text(model)
    Select(browser.find_element_by_id("MileageTo")).select_by_value("100000")
    Select(browser.find_element_by_id("RegistrationYearFrom")).select_by_visible_text("2010")
    # Tick the fuel checkboxes with values "1" and "4".
    # NOTE(review): which fuel types these values stand for is not visible
    # here - confirm against the site.
    for selectedFuel in browser.find_elements_by_name("selectedFuels"):
        if selectedFuel.get_attribute("value") in ["1", "4"]:
            selectedFuel.click()
    Select(browser.find_element_by_id("PriceTo")).select_by_value("15000")
    browser.find_element_by_id("PostalCode").send_keys("2350")
    Select(browser.find_element_by_id("MaxDistance")).select_by_value("50")


print("Connecting to mailserver...")
# The with-blocks and the try/finally below release the SMTP connection, the
# CSV file and the browser even when scraping fails part-way through.
with smtplib.SMTP('localhost') as smtpserver:
    print("Reading list of known cars in memory...")
    carUrls = _load_known_urls(carsfile)

    print("Opening csv file for writing...")
    with open(carsfile, "a", newline="", encoding="utf-8") as csvfile:
        csvwriter = csv.DictWriter(csvfile, fieldnames=headers,
                                   quoting=csv.QUOTE_ALL)

        print("Starting browser...")
        # Alternative driver kept from the original: webdriver.Firefox()
        browser = webdriver.PhantomJS()
        try:
            browser.implicitly_wait(10)

            # Repeat the search for each model.
            print("Initiating search...")
            for model in models:
                print("Looking for model {}...".format(model))
                _fill_search_form(browser, model)

                print("Submitting search...")
                browser.find_element_by_xpath(
                    "//div[@class='submitButton']/input").click()

                # Walk through every page of results.
                hasNextPage = True
                while hasNextPage:
                    cars = browser.find_elements_by_class_name("ulCarItem")
                    print("Found {} cars!".format(len(cars)))
                    for car in cars:
                        carUrl = car.find_element_by_tag_name("a").get_attribute("href")
                        if carUrl in carUrls:
                            continue
                        print("New car!")
                        carData = _extract_car_data(car, carUrl)

                        # Report to the user.
                        print(_format_car(carData, indent=" "))

                        # Save in csv. Flush straight away so the record
                        # survives a later crash, and remember the URL so the
                        # same car is not reported again during this run.
                        print("Storing in csv file...")
                        csvwriter.writerow(carData)
                        csvfile.flush()
                        carUrls.add(carUrl)

                        # Send an e-mail alert.
                        print("Sending email...")
                        message = MIMEText(_format_car(carData))
                        message["Subject"] = carData["subject"]
                        message["From"] = mailfrom
                        message["To"] = ", ".join(mailto)
                        smtpserver.sendmail(mailfrom, mailto, message.as_string())

                    # Open the next page, if there is one.
                    print("Looking to see if there is a next page...")
                    nextPageElem = browser.find_element_by_class_name("nextPage")
                    if nextPageElem.get_attribute("disabled"):
                        print("No next page; Done searching for model {}".format(model))
                        hasNextPage = False
                    else:
                        print("Browsing to next page of search results...")
                        nextPageElem.click()
                        # Fixed pause after the click.
                        # NOTE(review): presumably to let the next page of
                        # results load - confirm.
                        time.sleep(2)

                # For a new search, go back to the first page of search
                # results (Toyota site is bugged).
                print("Returning to first page of search results (Toyota search engine bug)...")
                for firstPageElem in browser.find_elements_by_xpath("//a[@pageindex='1']"):
                    if firstPageElem.is_displayed():
                        firstPageElem.click()
                        break
                time.sleep(2)

            print("Done searching; Closing open handles...")
        finally:
            # quit() closes every window and ends the driver session.
            browser.quit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment