Skip to content

Instantly share code, notes, and snippets.

@MarioVilas
Last active March 15, 2023 13:51
Show Gist options
  • Save MarioVilas/de6f088b6fe11f765971411ab226d84f to your computer and use it in GitHub Desktop.
Goodreads quote downloader script
#!/usr/bin/env python3
# Fortune cookie generator based on Goodreads quotes
# --------------------------------------------------
#
# Use the following commands to download quotes from a specific author (for example, Mark Twain):
#
# python3 goodreads-quote-downloader.py -e3 https://www.goodreads.com/author/quotes/1244.Mark_Twain > marktwain
# strfile -c % marktwain marktwain.dat
# sudo cp marktwain marktwain.dat /usr/share/games/fortunes/
import argparse
import os
import os.path
import sys
import time

from http.cookiejar import LWPCookieJar
from urllib.parse import unquote
from urllib.request import Request, urlopen
# Exported symbols.
__all__ = ['GoodreadsQuoteDownloader']

# We need to do this very early so the following code can show the help message on error.
ARGUMENT_PARSER = argparse.ArgumentParser()
# BUGFIX: argparse ignores default= on a positional argument unless nargs="?"
# is given, so the documented default URL was never applied and the argument
# was effectively mandatory. nargs="?" makes it optional as the help text says.
ARGUMENT_PARSER.add_argument("url", metavar="URL", nargs="?", default="https://www.goodreads.com/quotes", help="Link to quotes page (default: https://www.goodreads.com/quotes)")
ARGUMENT_PARSER.add_argument("--start", "-s", metavar="N", type=int, default=0, help="start page (default: 0)")
ARGUMENT_PARSER.add_argument("--end", "-e", metavar="N", type=int, default=0, help="end page (default: no limit)")
ARGUMENT_PARSER.add_argument("--pause", "-p", metavar="N", type=int, default=0, help="pause between HTTP requests in seconds (default: 0)")
ARGUMENT_PARSER.add_argument("--user-agent", metavar="STR", help="custom User-agent header for HTTP requests")

# Try to import the non built-in dependencies. Show an error message on failure.
try:
    from bs4 import BeautifulSoup
    is_bs4 = True
except ImportError:
    ARGUMENT_PARSER.error("missing dependency: BeautifulSoup\n\nRun the following command to install:\n\tpip3 install bs4")
class GoodreadsQuoteDownloader:
    """Downloads quotes from Goodreads quote pages.

    Session cookies are persisted in a ``.goodreads-cookie`` file in the
    user's home folder so repeated runs reuse the same Goodreads session.
    """

    # Spoofed browser User-Agent header; overridable per instance via --user-agent.
    USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)'

    def __init__(self, home_folder=None):
        """Create a downloader with a persistent cookie jar.

        :param home_folder: directory holding the cookie file; defaults to
            $HOME, then $USERHOME, then the current folder.
        """
        if not home_folder:
            home_folder = os.getenv('HOME')
        if not home_folder:
            home_folder = os.getenv('USERHOME')
        if not home_folder:
            home_folder = '.'  # Use the current folder on error.
        self.cookie_jar = LWPCookieJar(os.path.join(home_folder, '.goodreads-cookie'))
        try:
            self.cookie_jar.load()
        except Exception:
            pass  # Best effort: a missing or corrupt cookie file is not fatal.

    def get_quotes_from_page(self, url, page=0):
        """Fetch one page of quotes and return them as a list of strings.

        :param url: link to a Goodreads quotes page; any query string is stripped.
        :param page: page number to request; 0 omits the ``?page=`` parameter.
        :returns: list of cleaned-up quote texts (may be empty).
        :raises urllib.error.URLError: on network failure.
        """
        # Prepare the URL: drop any existing query string, then add the page number.
        if "?" in url:
            url = url[:url.find("?")]
        if page > 0:
            url = url + "?page=" + str(page)
        # Prepare the HTTP request.
        request = Request(url)
        if self.USER_AGENT:
            request.add_header('User-Agent', self.USER_AGENT)
        # Get cookies from the cookie jar.
        self.cookie_jar.add_cookie_header(request)
        # Make the HTTP request and get the HTML response.
        response = urlopen(request)
        # Store cookies in the cookie jar.
        self.cookie_jar.extract_cookies(response, request)
        html = response.read()
        response.close()
        try:
            self.cookie_jar.save()
        except Exception:
            pass  # Best effort: failing to persist cookies is not fatal.
        # Parse the HTML response and extract the quotes.
        # NOTE(review): unquote() undoes %xx percent-encoding — presumably some
        # quote texts contain escapes; confirm html.unescape wasn't intended.
        return [
            unquote(div.get_text()).replace(" ", " ").replace("\n\n", "\n").strip()
            for div in BeautifulSoup(html, 'html.parser').find_all("div", class_="quoteText")
        ]

    def iter_quotes(self, url, start=0, end=0, pause=0):
        """Iterate over quotes across pages, stopping at the first empty page.

        :param url: link to a Goodreads quotes page.
        :param start: first page number to request (default 0).
        :param end: last page number to request, inclusive; 0 means no limit.
        :param pause: seconds to sleep between HTTP requests (default 0).
        """
        page = start
        while end == 0 or page <= end:
            # BUGFIX: the pause argument was accepted (and passed by main())
            # but never used. Sleep between consecutive requests to throttle.
            if pause > 0 and page > start:
                time.sleep(pause)
            quotes = self.get_quotes_from_page(url, page)
            if not quotes:
                break
            yield from quotes
            page += 1
def main():
    """Command-line entry point: print quotes separated by fortune's '%' marker."""
    options = ARGUMENT_PARSER.parse_args()
    downloader = GoodreadsQuoteDownloader()
    if options.user_agent:
        downloader.USER_AGENT = options.user_agent
    quote_stream = downloader.iter_quotes(
        options.url, start=options.start, end=options.end, pause=options.pause)
    # Print a "%" separator line between quotes (but not before the first one).
    for index, quote in enumerate(quote_stream):
        if index > 0:
            print("%")
        print(quote)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment