Skip to content

Instantly share code, notes, and snippets.

@LordGhostX
Last active January 14, 2021 15:29
Show Gist options
  • Save LordGhostX/b2a5be6586ae2cfe56b47d2b80d6ae89 to your computer and use it in GitHub Desktop.
Save LordGhostX/b2a5be6586ae2cfe56b47d2b80d6ae89 to your computer and use it in GitHub Desktop.
import csv

import requests
from bs4 import BeautifulSoup
def get_quotes(tag, page):
    """Scrape one page of quotes for *tag* from quotes.toscrape.com.

    Args:
        tag: Tag name inserted into the URL path (e.g. ``"love"``).
        page: Page number inserted into the URL path.

    Returns:
        A list of dicts, one per quote, with the keys ``"text"``,
        ``"author"`` and ``"tags"`` (a list of tag strings). The list is
        empty when the response status is not 200 or when the page has no
        ``div.quote`` elements.

    Raises:
        requests.RequestException: If the HTTP request itself fails
            (connection error, timeout, ...).
    """
    quotes = []
    # A timeout keeps a stalled connection from hanging the scraper forever.
    r = requests.get(
        f"http://quotes.toscrape.com/tag/{tag}/page/{page}/",
        timeout=30,
    )
    if r.status_code != 200:
        return quotes

    quotes_page = BeautifulSoup(r.text, "html.parser")
    for quote in quotes_page.find_all("div", {"class": "quote"}):
        try:
            quotes.append({
                # [1:-1] drops the first and last character of the span text
                # (presumably the surrounding quotation marks -- confirm
                # against the page markup).
                "text": quote.find("span").text.strip()[1:-1],
                "author": quote.find("small").text.strip(),
                "tags": [
                    a.text.strip()
                    for a in quote.find("div", {"class": "tags"}).find_all("a")
                ],
            })
        except AttributeError:
            # find() returned None because an expected child element is
            # missing; skip this malformed quote but let every other error
            # (including KeyboardInterrupt) propagate.
            continue
    return quotes
def main(tag, start, end):
    """Scrape quotes for *tag* over a page range and save them as CSV.

    The output file is named ``{tag}-quotes-{start}-{end}.csv`` and uses
    ``;`` as the field delimiter with the columns ``author``, ``text`` and
    ``tags`` (tags joined by ``,``).

    Args:
        tag: Tag to scrape, passed through to ``get_quotes``.
        start: First page number to fetch.
        end: Last page number to fetch (inclusive). ``-1`` means "keep
            fetching consecutive pages until one yields no quotes"; the
            file name then carries the last page that produced quotes.
    """
    quotes = []
    if end == -1:
        # Open-ended mode: walk forward until a page comes back empty.
        end = start
        while True:
            new_quotes = get_quotes(tag, end)
            if not new_quotes:
                break
            quotes += new_quotes
            end += 1
        # `end` currently points at the empty page; step back to the last
        # page that actually had quotes so the file name is accurate.
        end -= 1
    else:
        for page in range(start, end + 1):
            quotes += get_quotes(tag, page)

    # newline="" is what the csv module requires of the file object, and
    # utf-8 keeps non-ASCII quote text from failing on locale encodings.
    with open(
        f"{tag}-quotes-{start}-{end}.csv", "w", encoding="utf-8", newline=""
    ) as f:
        # csv.writer quotes any field containing ';', '"' or a newline, so
        # such quotes no longer corrupt the row structure.
        writer = csv.writer(f, delimiter=";", lineterminator="\n")
        writer.writerow(["author", "text", "tags"])
        writer.writerows(
            [q["author"], q["text"], ",".join(q["tags"])] for q in quotes
        )
if __name__ == "__main__":
    # Prompt for the scrape parameters and run. Keyword arguments are
    # evaluated left to right, so the prompts appear in the order
    # tag -> start page -> end page.
    main(
        tag=input("Enter quotes tag you want to scrape e.g love, life: "),
        start=int(input("Enter page to start scraping from e.g 1, 5, 3: ")),
        end=int(input("Enter page to stop scraping from (-1 means unlimited): ")),
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment