# newtykip/scraper.py

## scraper.py
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from xlsxwriter import Workbook
from time import sleep

# Fetch all of the URLs
# `errors` starts empty here; the scraping loop appends to it and it is written
# out to errors.txt at the end of the script.
errors = []

with open('input.txt', 'r') as f:
	# One URL per line, with surrounding whitespace removed. Blank lines (for
	# example a trailing newline at the end of the file) are skipped so that
	# they are not handed to the browser as empty URLs.
	urls = [stripped for stripped in map(str.strip, f) if stripped]

# Instantiate the browser and workbook
# Chrome is started with '--log-level=3' on its command line
# (presumably to keep Chrome's own console logging quiet - confirm).
options = Options()
options.add_argument('--log-level=3')
browser = Chrome(options=options)

# Results go to the first worksheet of output.xlsx
workbook = Workbook('output.xlsx')
worksheet = workbook.add_worksheet()

# Header row. Column E is not written anywhere in this script.
for cell, heading in (
	('A1', 'Author'),
	('B1', 'Title'),
	('C1', 'ISBN-13'),
	('D1', 'Price'),
	('F1', 'URL'),
):
	worksheet.write(cell, heading)

# Go to the page
# `i` is the spreadsheet row most recently written (row 1 holds the headers), so
# it only advances when a page has been scraped successfully. Progress is
# reported with a separate counter so that failed URLs still count as processed.
i = 1

for position, url in enumerate(urls, start=1):
	try:
		print(f'Processing {url} ({position}/{len(urls)})')
		browser.get(url)
		# Fixed two-second pause after navigation, before querying the page
		sleep(2)

		# Find the author: text of the first link inside the 'author' element,
		# with its space-separated words put in reverse order
		author_words = browser.find_element(By.CLASS_NAME, 'author').find_element(By.TAG_NAME, 'a').text.split(' ')
		author = ' '.join(reversed(author_words))

		# Find the title
		title = browser.find_element(By.ID, 'productTitle').text

		# Find the price: the 'a-color-price' text inside the element that
		# carries the 'a-button-selected' class
		price = browser.find_element(By.CLASS_NAME, 'a-button-selected').find_element(By.CLASS_NAME, 'a-color-price').text

		# Find the ISBN: locate the element whose text mentions 'ISBN-13', go two
		# levels up, and parse that ancestor's inner HTML
		isbn_label = browser.find_element(By.XPATH, '//*[contains(text(), \'ISBN-13\')]')
		container_html = isbn_label.find_element(By.XPATH, '../..').get_attribute('innerHTML')
		details = list(BeautifulSoup(container_html, features='html.parser').children)

		# The value is taken from the second child of the second-to-last node
		isbn = list(details[len(details) - 2].children)
		isbn = isbn[1].decode_contents()

		# TODO: get shortened url

		# Write this information to the spreadsheet
		i += 1

		worksheet.write(f'A{i}', author)
		worksheet.write(f'B{i}', title)
		worksheet.write(f'C{i}', isbn)
		worksheet.write(f'D{i}', price)
		worksheet.write(f'F{i}', url)

		print(author, title, isbn, price)
	except Exception as exc:
		# Best effort: remember the URL and move on to the next one. Catching
		# Exception (rather than a bare except) lets Ctrl+C and SystemExit
		# still stop the run.
		print(f'Failed to process {url}: {exc!r}')
		errors.append(url)

# Wrap it up
try:
	# Size the columns to their contents and write output.xlsx to disk
	worksheet.autofit()
	workbook.close()
finally:
	# Always shut the Chrome session down, even if saving the workbook fails,
	# so no browser/driver process is left running
	browser.quit()

# Record every URL that could not be scraped, one per line
with open('errors.txt', 'w') as f:
	f.writelines(f'{failed_url}\n' for failed_url in errors)

print('Done :D')
	from selenium.webdriver import Chrome
	from selenium.webdriver.common.by import By
	from selenium.webdriver.chrome.options import Options
	from bs4 import BeautifulSoup
	from xlsxwriter import Workbook
	from time import sleep

	# Fetch all of the URLs
	# `errors` starts empty here; the scraping loop appends to it and it is
	# written out to errors.txt at the end of the script.
	errors = []

	with open('input.txt', 'r') as f:
		# One URL per line, with surrounding whitespace removed. Blank lines
		# (for example a trailing newline at the end of the file) are skipped
		# so that they are not handed to the browser as empty URLs.
		urls = [stripped for stripped in map(str.strip, f) if stripped]

	# Instantiate the browser and workbook
	# Chrome is started with '--log-level=3' on its command line
	# (presumably to keep Chrome's own console logging quiet - confirm).
	options = Options()
	options.add_argument('--log-level=3')
	browser = Chrome(options=options)

	# Results go to the first worksheet of output.xlsx
	workbook = Workbook('output.xlsx')
	worksheet = workbook.add_worksheet()

	# Header row. Column E is not written anywhere in this script.
	for cell, heading in (
		('A1', 'Author'),
		('B1', 'Title'),
		('C1', 'ISBN-13'),
		('D1', 'Price'),
		('F1', 'URL'),
	):
		worksheet.write(cell, heading)

	# Go to the page
	# `i` is the spreadsheet row most recently written (row 1 holds the
	# headers), so it only advances when a page has been scraped successfully.
	# Progress is reported with a separate counter so that failed URLs still
	# count as processed.
	i = 1

	for position, url in enumerate(urls, start=1):
		try:
			print(f'Processing {url} ({position}/{len(urls)})')
			browser.get(url)
			# Fixed two-second pause after navigation, before querying the page
			sleep(2)

			# Find the author: text of the first link inside the 'author'
			# element, with its space-separated words put in reverse order
			author_words = browser.find_element(By.CLASS_NAME, 'author').find_element(By.TAG_NAME, 'a').text.split(' ')
			author = ' '.join(reversed(author_words))

			# Find the title
			title = browser.find_element(By.ID, 'productTitle').text

			# Find the price: the 'a-color-price' text inside the element that
			# carries the 'a-button-selected' class
			price = browser.find_element(By.CLASS_NAME, 'a-button-selected').find_element(By.CLASS_NAME, 'a-color-price').text

			# Find the ISBN: locate the element whose text mentions 'ISBN-13',
			# go two levels up, and parse that ancestor's inner HTML
			isbn_label = browser.find_element(By.XPATH, '//*[contains(text(), \'ISBN-13\')]')
			container_html = isbn_label.find_element(By.XPATH, '../..').get_attribute('innerHTML')
			details = list(BeautifulSoup(container_html, features='html.parser').children)

			# The value is taken from the second child of the second-to-last node
			isbn = list(details[len(details) - 2].children)
			isbn = isbn[1].decode_contents()

			# TODO: get shortened url

			# Write this information to the spreadsheet
			i += 1

			worksheet.write(f'A{i}', author)
			worksheet.write(f'B{i}', title)
			worksheet.write(f'C{i}', isbn)
			worksheet.write(f'D{i}', price)
			worksheet.write(f'F{i}', url)

			print(author, title, isbn, price)
		except Exception as exc:
			# Best effort: remember the URL and move on to the next one.
			# Catching Exception (rather than a bare except) lets Ctrl+C and
			# SystemExit still stop the run.
			print(f'Failed to process {url}: {exc!r}')
			errors.append(url)

	# Wrap it up
	try:
		# Size the columns to their contents and write output.xlsx to disk
		worksheet.autofit()
		workbook.close()
	finally:
		# Always shut the Chrome session down, even if saving the workbook
		# fails, so no browser/driver process is left running
		browser.quit()

	# Record every URL that could not be scraped, one per line
	with open('errors.txt', 'w') as f:
		f.writelines(f'{failed_url}\n' for failed_url in errors)

	print('Done :D')