Skip to content

Instantly share code, notes, and snippets.

@lobstrio
Created May 3, 2018 12:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lobstrio/66b1ecb2915403e290443e44ddada13d to your computer and use it in GitHub Desktop.
Save lobstrio/66b1ecb2915403e290443e44ddada13d to your computer and use it in GitHub Desktop.
Scraping the SensCritique top-100 best books list with Python and Selenium
# python package
import csv
import time
import random
import codecs
import sys
# selenium package
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
# Force UTF-8 as the process-wide default string encoding.
# This hack only exists on Python 2: reload(sys) brings back
# sys.setdefaultencoding, which is removed from the sys namespace at startup.
# On Python 3 neither name exists, so the step is skipped there.
if sys.version_info[0] < 3:
    reload(sys)
    sys.setdefaultencoding('utf-8')
# Pause helper
def pause(min_seconds=1, max_seconds=2):
    """Sleep for a random whole number of seconds.

    The duration is drawn with ``random.randint``, so both bounds are
    inclusive. The defaults reproduce the original 1-2 second pause.

    Args:
        min_seconds: Shortest possible pause, in whole seconds.
        max_seconds: Longest possible pause, in whole seconds.

    Returns:
        None (the return value of ``time.sleep``).
    """
    time_break = random.randint(min_seconds, max_seconds)
    return time.sleep(time_break)
# Browser options: start Chrome with the --kiosk flag.
options = webdriver.ChromeOptions()
options.add_argument("--kiosk")
# Work on a copy: DesiredCapabilities.CHROME is a class-level dict shared by
# every user of the class, so editing it in place would leak this setting.
capa = DesiredCapabilities.CHROME.copy()
# "none" lets driver.get() return without waiting for the page load; the
# script relies on explicit waits instead.
capa["pageLoadStrategy"] = "none"
driver = webdriver.Chrome(desired_capabilities=capa, chrome_options=options)
# Explicit wait reused later in the script, with a 30-second timeout.
wait = WebDriverWait(driver, 30)
pause()
print("Driver 1 ouvert")
# Start URL
senscrit_url = "https://www.senscritique.com/livres/tops/top100-des-top10"
# Go to SensCritique, then block until at least one "img.lazy" element is
# present in the DOM (or the explicit wait times out).
driver.get(senscrit_url)
images = wait.until(
    EC.presence_of_all_elements_located((By.CSS_SELECTOR, "img.lazy"))
)
# print(...) with a single argument behaves the same on Python 2 and 3.
print("Connecte a Sens Critique")
# Scroll down the page in steps of 20% of its height, pausing after each
# step, starting from the top (0%).
SCROLL_TO_FRACTION_JS = "window.scrollTo(0, document.body.scrollHeight*%s);"
scheight = .0
while scheight < 1.0:
    scroll_script = SCROLL_TO_FRACTION_JS % scheight
    driver.execute_script(scroll_script)
    scheight = scheight + .2
    pause()
def _text_or_empty(parent, css_selector):
    """Return the text of the first element under *parent* matching *css_selector*.

    Returns an empty string when no such element exists, so an entry that is
    missing one field still produces a complete CSV row.
    """
    try:
        return parent.find_element_by_css_selector(css_selector).text
    except NoSuchElementException:
        return ''


# Open the CSV output and write one '$'-delimited row per list entry.
with codecs.open('senscritique_liste.csv', 'w') as csvfile:
    fieldnames = ['Titre', 'Auteur', 'Date', 'Rang', 'Note']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames, delimiter="$")
    writer.writeheader()
    # Collect the list entries; each "li.elto-item" yields one row below.
    items = driver.find_elements_by_css_selector("li.elto-item")
    for item in items:
        # Bring the entry into view before reading its fields.
        driver.execute_script(
            "arguments[0].scrollIntoView({behavior: 'smooth', block: 'center', inline: 'nearest'});",
            item)
        title = _text_or_empty(item, "a.elco-anchor")
        author = _text_or_empty(item, "span.elco-baseline-a")
        # Strip the surrounding parentheses. str.replace returns a new string,
        # so the result has to be assigned (the original discarded it).
        date = _text_or_empty(item, "span.elco-date")
        date = date.replace('(', '').replace(')', '')
        rank = _text_or_empty(item, "span.elto-rank-item")
        grade = _text_or_empty(item, "a.erra-global")
        # Write the CSV row
        writer.writerow({'Titre': title, 'Auteur': author, 'Date': date, 'Rang': rank, 'Note': grade})
        print("-- SUCCESS %s --" % title)
# Done: report success and shut the browser down.
print("Bravo !")
# quit() ends the whole WebDriver session (every window plus the driver
# process); close() would only close the current window.
driver.quit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment