Skip to content

Instantly share code, notes, and snippets.

@adammichaelwood
Created October 12, 2016 13:20
Show Gist options
  • Save adammichaelwood/a7568d31bdf36cdce0a9d8e5a688c5fc to your computer and use it in GitHub Desktop.
Save adammichaelwood/a7568d31bdf36cdce0a9d8e5a688c5fc to your computer and use it in GitHub Desktop.
import http.cookiejar
import random
import subprocess
import sys
import time
import urllib.request
from operator import itemgetter
from urllib.parse import quote_plus

import blessings
import google
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
# --- Setup: open the browser, run the book search, and read the run options ---

# Firefox session shared by every page load in this script.
browser = webdriver.Firefox()

# Percent-encode the topic for use in the query string. Spaces still become
# "+", but characters such as "+", "&" and "#" (e.g. a search for "C++") are
# now escaped instead of being passed through raw.
topic = quote_plus(input("Topic: "))

# Terminal helper used later to colorize the generated entries.
t = blessings.Terminal()

amz_base = "https://www.amazon.com/s/?url=search-alias%3Dstripbooks&field-keywords="
browser.get(amz_base + topic)

# An empty answer keeps the paperback default, which is applied by clicking
# the "Paperback" link on the results page; any other answer skips the click.
change_default_format = input("Default format is paperback. Change? ")
if not change_default_format:
    browser.find_element_by_partial_link_text("Paperback").click()

# Ask again on non-numeric input rather than crashing with ValueError after
# the browser session has already been started.
while True:
    try:
        pages_to_search = int(input("Pages to search? (About 10-12 books per page) "))
    except ValueError:
        print("Please enter a whole number.")
    else:
        break
# --- Collect product URLs from up to `pages_to_search` result pages ---

# A set, so a URL that shows up more than once is only stored once.
amz_links = set()
for _ in range(pages_to_search):
    titles = browser.find_elements_by_class_name("s-access-detail-page")
    for title in titles:
        href = title.get_attribute('href')
        if not href:
            # Element carries no link target; there is nothing to record.
            continue
        # Keep only the part of the URL that precedes "/ref".
        title_url = href.split('/ref', 1)[0]
        print(title_url)
        amz_links.add(title_url)
    # Fixed pause between pages (presumably to let the page settle and to
    # avoid rapid-fire requests -- confirm before shortening).
    time.sleep(5)
    # Scroll to the bottom of the page before looking for the pager link.
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    try:
        browser.find_element_by_partial_link_text('Next Page').click()
    except WebDriverException:
        # No usable "Next Page" link (missing or not clickable): stop paging
        # and continue with the links gathered so far. Only browser errors
        # are treated this way, so Ctrl+C still interrupts the script.
        break
print("Total books links: " + str(len(amz_links)))
# --- Interactively review each collected book and build its Markdown entry ---


def _author_credit(names):
    """Return the byline text for a list of author names.

    One name is returned unchanged, two are joined with " and ", three are
    rendered as "A, B, and C", and more than three collapse to
    "<first> et al.". An empty list yields an empty string.
    """
    if not names:
        return ""
    if len(names) == 1:
        return names[0]
    if len(names) == 2:
        return names[0] + " and " + names[1]
    if len(names) == 3:
        return names[0] + ", " + names[1] + ", and " + names[2]
    return names[0] + " et al."


books = []
time.sleep(5)
total_links = len(amz_links)
for count, book_url in enumerate(amz_links, 1):
    print(str(count) + " of " + str(total_links))
    print(book_url)
    browser.get(book_url)

    # An empty answer skips this book entirely.
    use = input("Use book? ")
    if not use:
        continue

    book = {'url': book_url}

    # Title: optionally cut at the first occurrence of a user-supplied marker
    # (the marker itself and everything after it are dropped).
    title = browser.find_element_by_id('productTitle').text
    print("TITLE: " + title)
    truncate_at = input("TRUNCATE: ")
    if not truncate_at:
        book['title'] = title
    else:
        book['title'] = title.split(truncate_at, 1)[0]
    print("TITLE TO USE: " + book['title'])

    # Authors: keep only contributors whose text is tagged "(Author)" and
    # strip that tag from the name.
    author_names = []
    for author in browser.find_elements_by_class_name('author'):
        name = author.text
        if "(Author)" not in name:
            continue
        author_names.append(name.split(' (Au', 1)[0])

    if not author_names:
        # Nothing was tagged "(Author)" on this page; ask instead of emitting
        # an entry with an empty byline.
        author_string = input("No author found. Author: ")
    elif len(author_names) == 1:
        # A sole author is credited by full name.
        author_string = author_names[0]
    else:
        # Several authors are credited by last name. The guess is the final
        # space-separated word, which the user can override.
        last_names = []
        for full_name in author_names:
            last_name = full_name.split(' ')[-1]
            print("Original: " + full_name + "\n"
                  "Last Name: " + last_name)
            fix_last_name = input("Fix last name: ")
            last_names.append(fix_last_name or last_name)
        author_string = _author_credit(last_names)
    print(author_string)
    book['author'] = author_string

    # Bring the "Product Details" section into view before asking for the
    # publication year. The scroll is only a convenience, so a page without
    # that text must not abort the session and lose the entries made so far.
    try:
        details_element = browser.find_element_by_xpath("//*[contains(text(), 'Product Details')]")
    except NoSuchElementException:
        pass
    else:
        browser.execute_script("return arguments[0].scrollIntoView();", details_element)

    book['year'] = input("Publication year: ")
    book['description'] = input("Description: ")

    # Markdown list item: " - [_Title_](url) (year), by Author, description".
    book_string = " - [_{title}_]({url}) ({year}), by {author}, {description}\n".format(**book)
    print(t.yellow(book_string))
    book['string'] = book_string
    books.append(book)
# --- Final output: print every accepted entry, then release the browser ---
for book in books:
    print(book['string'])

# End the WebDriver session so the Firefox window and driver process do not
# outlive the script.
browser.quit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment