Skip to content

Instantly share code, notes, and snippets.

@dusekdan
Created August 1, 2021 09:58
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dusekdan/0f828a5736fbb0d85a8f5cdc4b02ec9f to your computer and use it in GitHub Desktop.
Save dusekdan/0f828a5736fbb0d85a8f5cdc4b02ec9f to your computer and use it in GitHub Desktop.
Small script to scrape quotes from azquotes.com
import json
import logging as LOG
import requests
from bs4 import BeautifulSoup
# Listing URL prefix; the 1-based page number is appended directly to it.
QUOTES_BASE_URL = "https://www.azquotes.com/top_quotes.html?p="

# File the scraped quotes are written to (as JSON).
OUTPUT_FILE = "quotes-better.json"

# Exclusive upper bound of the page loop: pages 1 through 10 are fetched.
MAX_PAGE_NUMBER_EXCLUSIVE = 11

# Show INFO-level (and more severe) messages from the module-level LOG calls.
LOG.basicConfig(level=LOG.INFO)
def main():
    """Scrape every configured listing page and save the quotes as JSON.

    Pages 1 .. MAX_PAGE_NUMBER_EXCLUSIVE - 1 are fetched one after another
    with get_quotes_from_page(); the per-page lists are concatenated in page
    order and the combined list is written to OUTPUT_FILE as one JSON array.
    """
    LOG.info('Started')

    quotes = []
    for page in range(1, MAX_PAGE_NUMBER_EXCLUSIVE):
        # extend() grows the list in place; `quotes = quotes + ...` would copy
        # everything collected so far on each page.
        quotes.extend(get_quotes_from_page(f"{QUOTES_BASE_URL}{page}"))
    LOG.info("Quotes scraped: %d", len(quotes))

    # json.dump() escapes non-ASCII characters by default, so the explicit
    # encoding only makes the file independent of the platform's locale.
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        LOG.info("Writing %d quotes to file...", len(quotes))
        json.dump(quotes, f)

    LOG.info("Job's finished")
def get_quotes_from_page(url, timeout=30):
    """Download one listing page and extract the quotes it contains.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of single-entry dicts shaped like
        ``{"quote-<id>": {"content": <text>, "author": <name>}}``, where
        ``<id>`` is the part of the quote link's href after ``/quote/``.
        ``author`` is ``None`` when the entry has no ``.author > a`` link.
        The list is empty when the page has no ``ul.list-quotes`` element.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    LOG.info("Retrieving quotes from %s", url)

    # Without a timeout requests may wait on an unresponsive server forever.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, "html.parser")
    quotes_container = soup.find("ul", class_="list-quotes")
    if quotes_container is None:
        LOG.warning("No quote list found on %s", url)
        return []

    quotes = []
    for li in quotes_container.find_all("li"):
        quote_info = li.find("a", {"class": "title"})
        href = quote_info.get("href", "") if quote_info is not None else ""
        if "/quote/" not in href:
            # find_all() also yields <li> elements that are not quote entries
            # (e.g. nested lists); skip them instead of crashing.
            continue

        # Search for the author inside this <li> only. Selecting on the whole
        # document would attach the page's first author to every quote.
        author_link = li.select_one(".author > a")

        quotes.append({
            f"quote-{href.split('/quote/')[1]}": {
                "content": quote_info.text,
                "author": author_link.text if author_link is not None else None,
            }
        })

    LOG.info("%d quotes retrieved", len(quotes))
    return quotes
# Start scraping only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment