iwalfy/bashorg_scraper.py

## bashorg_scraper.py
#!/usr/bin/env python3

import requests
import os
import threading

try:
    from BeautifulSoup import BeautifulSoup
except ImportError:
    from bs4 import BeautifulSoup

# Root URL of the site; the front page and the per-quote pages are requested from it.
BASE_URL = "http://bashorg.org"
# First quote id of the range that main() distributes across worker threads.
START = 1
# Number of chunks the id range is split into; main() starts one thread per chunk.
THREADS = 10

def split(a, n):
    """Lazily yield ``n`` consecutive slices of ``a`` with near-equal lengths.

    The lengths differ by at most one element; when ``len(a)`` is not
    divisible by ``n`` the leading slices are the longer ones.

    Args:
        a: a sliceable sequence that supports ``len()``.
        n: number of slices to produce.

    Returns:
        A generator producing the slices in order.
    """
    base, extra = divmod(len(a), n)

    def _bounds(idx):
        # Each of the first `extra` slices absorbs one leftover element,
        # which shifts the start of every later slice accordingly.
        begin = idx * base + min(idx, extra)
        finish = begin + base + (1 if idx < extra else 0)
        return begin, finish

    return (a[lo:hi] for lo, hi in map(_bounds, range(n)))

def getLastQuote():
  """Return the id of the first quote listed on the site's front page.

  Fetches BASE_URL, takes the first ``<div class="q">`` element and parses
  the number that follows ``#`` in the text of its first link. The script
  treats this value as the id of the most recent quote.

  Returns:
    int: the id parsed from the link text.

  Raises:
    requests.RequestException: on timeout, connection failure or an HTTP
      error status.
    ValueError: if the page does not contain the expected markup or the
      link text does not hold a numeric id.
  """
  # Without a timeout a stalled connection would block the script forever.
  resp = requests.get(BASE_URL, timeout=30)
  # Fail on 4xx/5xx here instead of trying to parse an error page below.
  resp.raise_for_status()
  parse = BeautifulSoup(resp.text, "html.parser")

  last_quote = parse.find("div", attrs={"class": "q"})
  if last_quote is None:
    raise ValueError("no <div class='q'> element found on the front page")

  quote_link = last_quote.find("a")
  if quote_link is None:
    raise ValueError("first quote block on the front page contains no link")

  # The link text is expected to look like "...#<id>"; keep what follows '#'.
  link_text = quote_link.get_text()
  _, hash_sign, lastId = link_text.partition("#")
  if not hash_sign:
    raise ValueError(f"quote link text {link_text!r} contains no '#'")

  # A second '#' would leave a non-numeric tail; keep only the first field.
  return int(lastId.split("#")[0])

def percentage(part, whole):
  """Return ``part`` expressed as a percentage of ``whole``.

  Both arguments are converted with ``float()`` first, so anything that
  ``float()`` accepts may be passed. A zero ``whole`` raises
  ZeroDivisionError.
  """
  # Scale before dividing, in this order, so results match bit-for-bit.
  scaled_part = 100 * float(part)
  total = float(whole)
  return scaled_part / total

def doScrape(start, end, ti):
  """Download quotes ``start``..``end`` (inclusive) into ``bashorg/<id>.txt``.

  For every id the page ``BASE_URL/quote/<id>`` is fetched. If it contains a
  ``<div class="quote">`` element, the text of that element is written to a
  file named after the id; pages without such an element are skipped.

  Args:
    start: first quote id to fetch.
    end: last quote id to fetch (inclusive).
    ti: index of the calling worker, used only in the progress output.

  Raises:
    requests.RequestException: if a request times out or the connection fails.
  """
  # exist_ok avoids the check-then-create race: several workers start at the
  # same time and could all see the directory as missing.
  os.makedirs("bashorg", exist_ok=True)

  for i in range(start, end + 1):
    if i % 10 == 0:
      print(f"T:{ti}\tWorking on {i} of {end + 1}...")

    quote_url = f"{BASE_URL}/quote/{i}"
    # A timeout keeps one stalled connection from blocking this worker forever.
    resp = requests.get(quote_url, timeout=30)

    parse = BeautifulSoup(resp.text, "html.parser")
    quote_text_elem = parse.find("div", attrs={"class": "quote"})

    if not quote_text_elem:
      continue

    # <br> tags become line breaks; every other child contributes its text.
    quote_text = "".join(
      "\n" if child.name == "br" else child.get_text()
      for child in quote_text_elem.children
    )

    # Explicit UTF-8 so non-ASCII quote text is written identically on every
    # platform instead of depending on the locale's default encoding.
    with open(f"bashorg/{i}.txt", "w", encoding="utf-8") as file:
      file.write(quote_text)

def main():
  """Look up the last quote id and start worker threads to scrape all quotes.

  The ids ``START``..``lastId`` (inclusive) are divided into ``THREADS``
  consecutive chunks, and one thread running ``doScrape`` is started for each
  non-empty chunk. The threads are not joined here.
  """
  lastId = getLastQuote()
  print("Quotes will be saved to 'bashorg' directory")

  # range() excludes its stop value, so add 1 to include the last quote itself.
  chunks = split(range(START, lastId + 1), THREADS)
  for ti, chunk in enumerate(chunks):
    # With fewer ids than threads some chunks are empty; indexing an empty
    # chunk would raise IndexError, and it has no work anyway.
    if not chunk:
      continue

    first_id = chunk[0]
    final_id = chunk[-1]

    t = threading.Thread(target=doScrape, args=(first_id, final_id, ti))
    t.start()

# Start scraping only when the file is executed as a script, not on import.
if __name__ == "__main__":
  main()
	#!/usr/bin/env python3

	import requests
	import os
	import threading

	try:
	from BeautifulSoup import BeautifulSoup
	except ImportError:
	from bs4 import BeautifulSoup

	BASE_URL = "http://bashorg.org"
	START = 1
	THREADS = 10

	def split(a, n):
	k, m = divmod(len(a), n)
	return (a[ik+min(i, m):(i+1)k+min(i+1, m)] for i in range(n))

	def getLastQuote():
	resp = requests.get(BASE_URL)
	html = resp.text
	parse = BeautifulSoup(html, "html.parser")

	last_quote = parse.find("div", attrs={"class":"q"})
	quote_link = last_quote.find("a")
	link_text = quote_link.get_text()
	lastId = link_text.split("#")[1]

	return int(lastId)

	def percentage(part, whole):
	return 100 * float(part)/float(whole)

	def doScrape(start, end, ti):
	if not os.path.exists("bashorg"):
	os.mkdir("bashorg")

	for i in range(start, (end + 1)):
	if i % 10 == 0:
	print(f"T:{ti}\tWorking on {i} of {end + 1}...")

	quote_url = f"{BASE_URL}/quote/{i}"
	resp = requests.get(quote_url)
	html = resp.text

	parse = BeautifulSoup(html, "html.parser")
	quote_text_elem = parse.find("div", attrs={"class":"quote"})

	if not quote_text_elem:
	continue

	quote_text = ""

	for child in quote_text_elem.children:
	if child.name != "br":
	quote_text += child.get_text()
	else:
	quote_text += "\n"

	with open(f"bashorg/{i}.txt", "w") as file:
	file.write(quote_text)

	def main():
	lastId = getLastQuote()
	print("Quotes will be saved to 'bashorg' directory")

	x = list(split(range(START, lastId), THREADS))
	for ti, i in enumerate(x):
	y = i[0]
	z = i[-1]

	t = threading.Thread(target=doScrape, args=(y, z, ti))
	t.start()

	if __name__ == "__main__":
	main()