Last active
June 10, 2020 06:42
-
-
Save LaBlazer/5132a4bf6f722f92347d5f5a3345cc01 to your computer and use it in GitHub Desktop.
Script that scrapes the site zlatestranky.sk for names and surnames
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Made by LBLZR_ | |
#Blah, blah, blah, uses Apache license... | |
import requests, json, re, os.path, codecs, time, sys | |
from concurrent.futures import ThreadPoolExecutor | |
from bs4 import BeautifulSoup | |
from time import sleep | |
# Output files for the two result lists (UTF-8, one entry per line).
NAMES_FILENAME = "sk_names.txt"
SURNAMES_FILENAME = "sk_surnames.txt"

#letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
letters = "QRSTUVWXYZ"  # subset still left to scrape; switch back to the full alphabet above for a fresh run
# {0} = initial letter of the surname, {1} = 1-based page number
url = "https://www.zlatestranky.sk/osoby-az/{0}/{1}/"

# One shared session so connections are reused across all requests;
# a desktop browser User-Agent avoids trivial bot filtering.
rqs = requests.Session()
rqs.headers.update({"User-Agent": "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36"})

# Accumulated, duplicate-free results (pre-loaded from disk further below).
names = []
surnames = []
def get_names(soup):
    """Extract person entries from a parsed results page and append any
    previously unseen names/surnames to the global `names`/`surnames` lists.

    Each <li> in the matched result columns holds text starting with
    "SURNAME NAME ..." plus noise; short tokens and punctuation are stripped
    before splitting.
    """
    global names, surnames
    for column in soup.find_all("ul", class_="col6 pl60 t-p0 p-p0"):
        for row in column.find_all("li"):
            text = row.get_text()
            # drop 1-2 character tokens (initials, abbreviations) and punctuation
            text = re.sub(r"\b\w{1,2}\b|[-_,.]", "", text)
            # collapse the runs of spaces left behind by the removal above
            text = re.sub(r"[ ]{2,}", " ", text)
            split = text.split()
            # need at least "SURNAME NAME"; any extra tokens are ignored
            if len(split) >= 2:
                # str.title() already lowercases the rest of each word,
                # so the original's extra .lower() was redundant
                name = split[1].title()
                surname = split[0].title()
                # NOTE(review): this runs from worker threads; the
                # check-then-append below is not atomic, so rare duplicates
                # are possible — confirm whether that matters downstream.
                if name not in names:
                    names.append(name)
                if surname not in surnames:
                    surnames.append(surname)
def scrape(urll):
    """Download one results page and hand the parsed HTML to get_names()."""
    response = rqs.get(urll)
    page = BeautifulSoup(response.text, 'html.parser')
    get_names(page)
def save_list(listt, filename):
    """Write every item of `listt` to `filename`, UTF-8 with CRLF line endings.

    Mode "w" already truncates the file on open, so the explicit
    fp.truncate() the original called was redundant and has been dropped.
    """
    with codecs.open(filename, "w", "utf-8") as fp:
        fp.writelines("{}\r\n".format(item) for item in listt)
def load_list(filename):
    """Read a previously saved list back in; a missing file yields []."""
    if not os.path.isfile(filename):
        return []
    with codecs.open(filename, "r", "utf-8") as fp:
        # strip() removes the trailing CRLF (and any stray whitespace)
        return [line.strip() for line in fp]
def update_progress(text, done, total):
    """Render a carriage-return style progress bar on stdout.

    Rewrites the current terminal line each call; appends a newline once
    `done` reaches `total` so the bar is left on screen.
    """
    bar_length = 30
    filled = int(round(bar_length * done / total))
    bar = "█" * filled + "-" * (bar_length - filled)
    line = "\r{} [{}] ({}/{})".format(text, bar, done, total)
    if done == total:
        line += "\n"
    sys.stdout.write(line)
    sys.stdout.flush()
# Resume from whatever was scraped on previous runs.
names = load_list(NAMES_FILENAME)
surnames = load_list(SURNAMES_FILENAME)

print("This script was made by LBLZR_. Pls no steal, thank.")

for letter in letters:
    # Fetch page 1 first to learn how many pages this letter has.
    r = rqs.get(url.format(letter, 1))
    soup = BeautifulSoup(r.text, 'html.parser')
    paginate = soup.find("ul", class_="paginate ptb30 divider").find_all("a")
    # The second-to-last pagination link holds the highest page number;
    # a single link means there is only one page.
    total_pages = int(paginate[-2].get_text()) if len(paginate) > 1 else 1
    print("Scraping letter {} with total of {} pages".format(letter, total_pages))
    update_progress("Scraping...", 1, total_pages)
    get_names(soup)
    # Remaining pages are fetched concurrently.
    if total_pages > 1:
        with ThreadPoolExecutor(max_workers=15) as executor:
            for page in range(2, total_pages + 1):
                # NOTE(review): the bar advances at submit time, not when a
                # page actually finishes downloading.
                update_progress("Scraping...", page, total_pages)
                executor.submit(scrape, url.format(letter, page))
                # Throttle submissions so the site is not hammered.
                sleep(0.2)
        # The with-block exit already performs shutdown(wait=True), so the
        # explicit redundant call has been removed.
    print("Finished scraping letter {}, saving...".format(letter))
    save_list(names, NAMES_FILENAME)
    save_list(surnames, SURNAMES_FILENAME)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment