@newtykip
Created February 14, 2023 03:15
Selenium scraper that fetches information about books (author, title, ISBN-13, and price) given a list of Amazon URLs. It reads one URL per line from input.txt, writes the results to output.xlsx, and logs any URLs that could not be scraped to errors.txt.
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from xlsxwriter import Workbook
from time import sleep
# Read the list of URLs from input.txt (one Amazon URL per line)
urls = []
errors = []
with open('input.txt', 'r') as f:
    for line in f.readlines():
        urls.append(line.strip())
# Instantiate the browser and workbook
options = Options()
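# Keep Chrome's console output quiet (log level 3 = fatal errors only)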
options.add_argument('--log-level=3')
browser = Chrome(options=options)
workbook = Workbook('output.xlsx')
worksheet = workbook.add_worksheet()
worksheet.write('A1', 'Author')
worksheet.write('B1', 'Title')
worksheet.write('C1', 'ISBN-13')
worksheet.write('D1', 'Price')
worksheet.write('F1', 'URL')
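# Note: column E is left unwritten; the full URL goes in column F (see the shortened-URL TODO below)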
# Visit each URL and scrape the book's details
i = 1
for url in urls:
    try:
        print(f'Processing {url} ({i}/{len(urls)})')
        browser.get(url)
        sleep(2)
        # Find the author and reverse the word order of the name so it reads surname-first
        author = browser.find_element(By.CLASS_NAME, 'author').find_element(By.TAG_NAME, 'a').text.split(' ')
        author.reverse()
        author = ' '.join(author)
        # Find the title
        title = browser.find_element(By.ID, 'productTitle').text
        # Find the price shown on the currently selected format button
        price = browser.find_element(By.CLASS_NAME, 'a-button-selected').find_element(By.CLASS_NAME, 'a-color-price').text
        # Find the ISBN-13: locate the element whose text contains 'ISBN-13',
        # climb two levels up, and dig the value out of the parsed markup
        details = list(BeautifulSoup(browser.find_element(By.XPATH, '//*[contains(text(), \'ISBN-13\')]').find_element(By.XPATH, '../..').get_attribute('innerHTML'), features='html.parser').children)
        isbn = list(details[len(details) - 2].children)
        isbn = isbn[1].decode_contents()
        # TODO: get shortened url
        # Write this information to the spreadsheet
        i += 1
        worksheet.write(f'A{i}', author)
        worksheet.write(f'B{i}', title)
        worksheet.write(f'C{i}', isbn)
        worksheet.write(f'D{i}', price)
        worksheet.write(f'F{i}', url)
        print(author, title, isbn, price)
    except Exception:
        # Record the failing URL and move on to the next one
        errors.append(url)
# Wrap it up
worksheet.autofit()
workbook.close()
with open('errors.txt', 'w') as f:
    f.writelines(map(lambda x: f'{x}\n', errors))
print('Done :D')
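A note on the fixed sleep(2): it assumes two seconds is always enough for Amazon's product page to render. Below is a minimal sketch of an alternative using Selenium's explicit waits, assuming the productTitle element used above is a good signal that the page is ready (the load_product_page helper is hypothetical, not part of the gist):

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def load_product_page(browser, url, timeout=10):
    # Navigate and block until the product title is present (or time out)
    browser.get(url)
    WebDriverWait(browser, timeout).until(
        EC.presence_of_element_located((By.ID, 'productTitle'))
    )

Each call would replace the browser.get(url) / sleep(2) pair inside the loop.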