Skip to content

Instantly share code, notes, and snippets.

@LaBlazer
Last active June 10, 2020 06:42
Show Gist options
  • Save LaBlazer/5132a4bf6f722f92347d5f5a3345cc01 to your computer and use it in GitHub Desktop.
Save LaBlazer/5132a4bf6f722f92347d5f5a3345cc01 to your computer and use it in GitHub Desktop.
Script that scrapes the page zlatestranky.sk for names and surnames
#Made by LBLZR_
#Blah, blah, blah, uses Apache license...
import codecs
import json
import os.path
import re
import sys
import threading
import time
from concurrent.futures import ThreadPoolExecutor
from time import sleep

import requests
from bs4 import BeautifulSoup
# Output files for the collected first names and surnames.
NAMES_FILENAME = "sk_names.txt"
SURNAMES_FILENAME = "sk_surnames.txt"

# Initial letters whose listing pages are scraped.  The complete alphabet is
# "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; only the tail is enabled here (presumably to
# resume an earlier, interrupted run -- TODO confirm).
letters = "QRSTUVWXYZ"

# Listing URL template: {0} is the letter, {1} is the page number (pages are
# requested starting from 1).
url = "https://www.zlatestranky.sk/osoby-az/{0}/{1}/"

# Not referenced by the code visible in this file; the User-Agent actually
# sent is the one set on the session below.
user_agent = ""

# A single session shared by every request, carrying a browser-like
# User-Agent header.
rqs = requests.Session()
rqs.headers["User-Agent"] = (
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36"
)
# Unique first names and surnames collected so far, in first-seen order.
names = []
surnames = []

# Guards the check-then-append in get_names(): scrape() calls get_names()
# from several ThreadPoolExecutor workers at once, and without the lock two
# workers could both miss a value and both append it.
_names_lock = threading.Lock()

# Noise stripped from every entry: standalone "words" of one or two word
# characters, plus the punctuation characters - _ , and .
_NOISE_RE = re.compile(r"\b\w{1,2}\b|[-_,.]")


def get_names(soup):
    """Collect first names and surnames from one parsed listing page.

    Every ``<li>`` inside the ``<ul class="col6 pl60 t-p0 p-p0">`` columns of
    *soup* is cleaned with ``_NOISE_RE`` and split on whitespace.  The first
    remaining word is taken as the surname and the second as the first name;
    both are normalised with ``.lower().title()``.  Entries with fewer than
    two words are skipped.  Values not seen before are appended to the
    module-level ``names`` / ``surnames`` lists.

    Args:
        soup: a parsed page exposing BeautifulSoup's ``find_all`` API.

    Returns:
        None.  The results are accumulated in ``names`` and ``surnames``.
    """
    for column in soup.find_all("ul", class_="col6 pl60 t-p0 p-p0"):
        for row in column.find_all("li"):
            # split() already collapses runs of whitespace left behind by the
            # substitution, so no separate space-squeezing pass is needed.
            parts = _NOISE_RE.sub("", row.get_text()).split()
            if len(parts) < 2:
                # Need both a surname and a first name.
                continue
            surname = parts[0].lower().title()
            name = parts[1].lower().title()
            with _names_lock:
                if name not in names:
                    names.append(name)
                if surname not in surnames:
                    surnames.append(surname)
def scrape(urll):
    """Download one listing page and pass its parsed HTML to get_names().

    Args:
        urll: absolute URL of the page to fetch.

    Returns:
        None.  A page that cannot be fetched (timeout, connection error or
        HTTP error status) is reported on stderr and skipped.
    """
    try:
        # Without a timeout a stalled connection would block this worker
        # thread indefinitely.
        r = rqs.get(urll, timeout=30)
        r.raise_for_status()
    except requests.RequestException as err:
        # This function runs inside an executor whose futures are not
        # inspected, so an exception raised here would vanish silently;
        # report it instead.  The leading newline steps off the "\r"
        # progress-bar line.
        sys.stderr.write("\nFailed to fetch {}: {}\n".format(urll, err))
        return
    soup = BeautifulSoup(r.text, 'html.parser')
    get_names(soup)
def save_list(listt, filename):
    """Write every item of *listt* to *filename*, one per line.

    The file is created or overwritten, encoded as UTF-8, and each item is
    followed by a CR LF line terminator.

    Args:
        listt: iterable of items; each is rendered with ``str.format``.
        filename: path of the file to (over)write.
    """
    # Mode "w" already empties an existing file, so no explicit truncate is
    # needed.  newline="" switches off newline translation so the explicit
    # "\r\n" is written unchanged on every platform.
    with open(filename, "w", encoding="utf-8", newline="") as fp:
        fp.writelines("{}\r\n".format(item) for item in listt)
def load_list(filename):
    """Read a list of entries back from *filename*.

    Args:
        filename: path of a UTF-8 text file with one entry per line.

    Returns:
        The file's lines with leading and trailing whitespace stripped, or
        an empty list when *filename* is not an existing regular file.
    """
    if not os.path.isfile(filename):
        return []
    with codecs.open(filename, "r", "utf-8") as handle:
        # strip() drops the line terminator along with any surrounding blanks.
        return [entry.strip() for entry in handle.readlines()]
def update_progress(text, done, total):
    """Redraw a one-line text progress bar on stdout.

    The line starts with a carriage return so that it overwrites the bar
    drawn by the previous call; a newline is appended once *done* equals
    *total*.

    Args:
        text: label printed in front of the bar.
        done: number of finished steps.
        total: number of steps overall.  A total of 0 is drawn as a complete
            bar instead of raising ZeroDivisionError.
    """
    bar_length = 30
    progress = done / total if total else 1.0
    block = int(round(bar_length * progress))
    bar = "█" * block + "-" * (bar_length - block)
    line = "\r{0} [{1}] ({2}/{3})".format(text, bar, done, total)
    if total == done:
        line += "\n"
    sys.stdout.write(line)
    # Flush so the bar shows up immediately even though no newline was sent.
    sys.stdout.flush()
# Start from whatever is already stored on disk so that the new results are
# merged into the existing lists.
names = load_list(NAMES_FILENAME)
surnames = load_list(SURNAMES_FILENAME)
print("This script was made by LBLZR_. Pls no steal, thank.")
for letter in letters:
    # The first page of a letter is fetched synchronously: it supplies both
    # the page count (from the paginator) and the first batch of names.
    r = rqs.get(url.format(letter, 1), timeout=30)
    soup = BeautifulSoup(r.text, 'html.parser')
    # find() returns None when the page has no paginator element; treat that
    # as a single-page letter rather than failing on None.find_all.
    paginator = soup.find("ul", class_="paginate ptb30 divider")
    paginate = paginator.find_all("a") if paginator is not None else []
    # The second-to-last paginator link is read as the last page number
    # (presumably the final link is a "next" arrow -- TODO confirm).
    total_pages = int(paginate[-2].get_text()) if len(paginate) > 1 else 1
    print("Scraping letter {} with total of {} pages".format(letter, total_pages))
    update_progress("Scraping...", 1, total_pages)
    get_names(soup)
    # Remaining pages, if any, are fetched by a pool of worker threads.
    if total_pages > 1:
        futures = []
        with ThreadPoolExecutor(max_workers=15) as executor:
            for page in range(2, total_pages + 1):
                # NOTE: the bar advances when a page is submitted, not when
                # its download has finished.
                update_progress("Scraping...", page, total_pages)
                futures.append(executor.submit(scrape, url.format(letter, page)))
                # Pause between submissions to throttle the request rate.
                sleep(0.2)
        # Leaving the with-block has waited for every submitted page, so each
        # future is finished here; surface failures that would otherwise be
        # discarded together with the futures.
        failed = sum(1 for future in futures if future.exception() is not None)
        if failed:
            print("Warning: {} page(s) of letter {} could not be scraped".format(failed, letter))
    print("Finished scraping letter {}, saving...".format(letter))
    # Save after every letter so an interrupted run keeps its progress.
    save_list(names, NAMES_FILENAME)
    save_list(surnames, SURNAMES_FILENAME)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment