Skip to content

Instantly share code, notes, and snippets.

@iwalfy
Last active June 24, 2023 06:22
Show Gist options
  • Save iwalfy/e69e256e010e4d1d1b22a73bc0f0d7a9 to your computer and use it in GitHub Desktop.
Save iwalfy/e69e256e010e4d1d1b22a73bc0f0d7a9 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import requests
import os
import threading
try:
from BeautifulSoup import BeautifulSoup
except ImportError:
from bs4 import BeautifulSoup
# Root URL of the site being scraped; the front page and the per-quote pages
# (``/quote/<id>``) are both requested beneath it.
BASE_URL = "http://bashorg.org"
# First quote ID of the range that main() hands out to the workers.
START = 1
# Number of slices the ID range is divided into; main() starts one thread per slice.
THREADS = 10
def split(a, n):
    """Lazily cut sequence ``a`` into ``n`` contiguous slices of near-equal size.

    The first ``len(a) % n`` slices each carry one extra element, so slice
    lengths never differ by more than one.  When ``a`` has fewer than ``n``
    elements the trailing slices are empty.

    Args:
        a: Any sliceable sequence that supports ``len()`` (list, range, str...).
        n: Number of slices to produce.

    Returns:
        A generator yielding the slices in order.
    """
    base, extra = divmod(len(a), n)
    # Boundary ``idx`` lies after ``idx`` base-sized chunks plus one bonus
    # element for each of the leading "long" chunks that precede it.
    bounds = [idx * base + min(idx, extra) for idx in range(n + 1)]
    return (a[lo:hi] for lo, hi in zip(bounds, bounds[1:]))
def getLastQuote():
    """Return the numeric ID read from the first quote block on the front page.

    Fetches ``BASE_URL``, locates the first ``<div class="q">`` element, and
    parses the text of its first ``<a>`` link, which is expected to look like
    ``...#<id>``.  The caller treats the result as the highest quote ID
    (presumably the front page lists the newest quote first -- confirm
    against the live site).

    Returns:
        The quote ID as an ``int``.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
        RuntimeError: If the page does not contain the expected markup or
            the link text does not carry a numeric ID.
    """
    # A timeout keeps a stalled server from hanging the script forever.
    resp = requests.get(BASE_URL, timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    resp.raise_for_status()

    parse = BeautifulSoup(resp.text, "html.parser")
    last_quote = parse.find("div", attrs={"class": "q"})
    quote_link = last_quote.find("a") if last_quote is not None else None
    if quote_link is None:
        raise RuntimeError(
            f"Could not find a quote link on {BASE_URL}; page layout may have changed"
        )

    link_text = quote_link.get_text()
    pieces = link_text.split("#")
    if len(pieces) < 2:
        raise RuntimeError(f"Quote link text {link_text!r} does not contain '#<id>'")
    try:
        return int(pieces[1])
    except ValueError as err:
        raise RuntimeError(
            f"Quote link text {link_text!r} does not contain a numeric id"
        ) from err
def percentage(part, whole):
    """Express ``part`` as a percentage of ``whole``.

    Both arguments are converted with ``float()`` first, so numeric strings
    are accepted.  A ``whole`` of zero raises ``ZeroDivisionError``.

    Returns:
        ``100 * part / whole`` as a float.
    """
    scaled_part = 100 * float(part)
    return scaled_part / float(whole)
def doScrape(start, end, ti):
    """Download quotes ``start``..``end`` (inclusive) into ``bashorg/<id>.txt``.

    For every ID the page ``{BASE_URL}/quote/<id>`` is fetched and the first
    ``<div class="quote">`` element is flattened to plain text, with each
    ``<br>`` child turned into a newline.  IDs whose page has no such element
    are skipped without writing a file.  A progress line is printed for every
    ID divisible by 10.

    Args:
        start: First quote ID to fetch.
        end: Last quote ID to fetch (inclusive).
        ti: Worker index, used only to tag printed messages.
    """
    # Several workers reach this point at once; exist_ok avoids the
    # check-then-create race that made os.mkdir raise FileExistsError.
    os.makedirs("bashorg", exist_ok=True)

    for i in range(start, end + 1):
        if i % 10 == 0:
            print(f"T:{ti}\tWorking on {i} of {end + 1}...")

        quote_url = f"{BASE_URL}/quote/{i}"
        try:
            # The timeout stops one stalled connection from blocking the
            # worker indefinitely.
            resp = requests.get(quote_url, timeout=30)
        except requests.RequestException as err:
            # One failed download should not abort the rest of this slice.
            print(f"T:{ti}\tSkipping {i}: {err}")
            continue

        parse = BeautifulSoup(resp.text, "html.parser")
        quote_text_elem = parse.find("div", attrs={"class": "quote"})
        if quote_text_elem is None:
            continue

        pieces = [
            "\n" if child.name == "br" else child.get_text()
            for child in quote_text_elem.children
        ]
        # Explicit UTF-8: the platform default encoding may be unable to
        # represent non-ASCII quote text.
        with open(f"bashorg/{i}.txt", "w", encoding="utf-8") as file:
            file.write("".join(pieces))
def main():
    """Scrape every quote from ``START`` up to the ID reported by the front page.

    The ID range is divided into ``THREADS`` contiguous slices with
    ``split`` and one ``doScrape`` thread is started per non-empty slice.
    The function returns once all worker threads have finished.
    """
    lastId = getLastQuote()
    print("Quotes will be saved to 'bashorg' directory")

    # doScrape treats its ``end`` argument as inclusive, so the range itself
    # must contain lastId; otherwise the highest quote is never downloaded.
    chunks = split(range(START, lastId + 1), THREADS)

    workers = []
    for ti, chunk in enumerate(chunks):
        # With fewer IDs than THREADS the trailing slices are empty and have
        # no first/last element to hand to a worker.
        if not chunk:
            continue
        worker = threading.Thread(target=doScrape, args=(chunk[0], chunk[-1], ti))
        worker.start()
        workers.append(worker)

    for worker in workers:
        worker.join()
# Start the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment