Skip to content

Instantly share code, notes, and snippets.

Created December 1, 2020 12:54
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
What would you like to do?
Scraping Web - Inspiring Quotes
Python script to scrape amazing quotes by some great computer scientists
import os
from typing import List
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
from bs4.element import Tag
# File name of the quotes file. The leading "/" allows it to be appended to
# QUOTES_TXT_PATH with plain string concatenation.
QUOTES_FILENAME = "/quotes.txt"
# Folder, inside the current working directory, that holds the quotes file.
QUOTES_TXT_PATH = os.getcwd() + "/quotes"
# Full path of the quotes text file (folder + file name).
QUOTES_FILE_PATH = QUOTES_TXT_PATH + QUOTES_FILENAME
# NOTE(review): QUOTES_URL is passed to get_bs4_obj() by the script entry
# point but is not defined in the visible source - confirm it is defined
# before running the script.
def get_bs4_obj(url: str) -> BeautifulSoup:
    """Download the page at *url* and return it as a BeautifulSoup object.

    Args:
        url: Address of the page to fetch.

    Returns:
        The downloaded HTML parsed with the ``html.parser`` backend.

    Raises:
        urllib.error.URLError: If the page cannot be retrieved.
    """
    # Send an explicit browser-like User-Agent instead of urllib's default
    # (presumably the target site rejects the default agent - confirm).
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    # The context manager closes the HTTP response as soon as the body has
    # been read, rather than leaving the connection open.
    with urlopen(req) as response:
        html = response.read()
    return BeautifulSoup(html, 'html.parser')
def get_ol_tags(bs4Obj: BeautifulSoup) -> List[Tag]:
    """Return the <ol> tags of the parsed page that may hold quotes.

    Every <ol> in the document is kept unless its ``class`` attribute is
    exactly ``['commentlist']``.

    Note: this selection rule is a requirement of the page being scraped;
    a different page to scrape will need a different rule.
    """
    return [
        ol_tag
        for ol_tag in bs4Obj.find_all('ol')
        if ol_tag.attrs.get('class') != ['commentlist']
    ]
def get_all_quotes(oltags: List[Tag]):
    """Yield the text of the first <li> found in each of the given <ol> tags.

    Args:
        oltags: The <ol> tags to extract quotes from.

    Yields:
        The text content of the first <li> of every list that has one.
        Lists without any <li> are skipped.
    """
    for ol in oltags:
        first_item = ol.find('li')
        # find() returns None when the list has no <li>; skip such lists
        # instead of failing with AttributeError on None.get_text().
        if first_item is None:
            continue
        yield first_item.get_text()
def save_qoutes(oltags: List[Tag]):
    """Save the quotes extracted from *oltags* to the quotes text file.

    The folder QUOTES_TXT_PATH is created if it is not already present. The
    file (QUOTES_TXT_PATH + QUOTES_FILENAME) is overwritten on every call.

    Args:
        oltags: The <ol> tags whose quotes should be written.
    """
    # exist_ok avoids both the separate existence check and the error that
    # would occur if the folder appears between the check and the creation.
    os.makedirs(QUOTES_TXT_PATH, exist_ok=True)
    # QUOTES_FILENAME starts with "/", so plain concatenation yields the path.
    quotes_file_path = QUOTES_TXT_PATH + QUOTES_FILENAME
    # Explicit UTF-8 so quotes with non-ASCII characters can be written
    # regardless of the platform's default encoding.
    with open(quotes_file_path, 'w', encoding='utf-8') as file:
        # NOTE(review): one quote per line is assumed - confirm the format.
        file.writelines(f'{txt}\n' for txt in get_all_quotes(oltags))
    print(f'All Quotes written to file: {quotes_file_path}')
if __name__ == "__main__":
    # Script entry point: download and parse the page, select the relevant
    # <ol> tags, then write their quotes to the quotes file.
    # NOTE(review): QUOTES_URL is not defined in the visible source - confirm
    # it is defined at module level before running.
    bs4Obj = get_bs4_obj(QUOTES_URL)
    olTags = get_ol_tags(bs4Obj)
    # Without this call the selected tags were computed but never saved.
    save_qoutes(olTags)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment