Last active
June 10, 2020 06:42
-
-
Save LaBlazer/5132a4bf6f722f92347d5f5a3345cc01 to your computer and use it in GitHub Desktop.
Script that scrapes the site zlatestranky.sk for names and surnames
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Made by LBLZR_ | |
#Blah, blah, blah, uses Apache license... | |
import requests, json, re, os.path, codecs, time, sys | |
from concurrent.futures import ThreadPoolExecutor | |
from bs4 import BeautifulSoup | |
from time import sleep | |
# Output files for the two result lists (UTF-8, one entry per line).
NAMES_FILENAME = "sk_names.txt"
SURNAMES_FILENAME = "sk_surnames.txt"

#letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
letters = "QRSTUVWXYZ"  # subset still left to scrape; switch back to the full alphabet above for a fresh run
# {0} = initial letter of the surname, {1} = 1-based page number
url = "https://www.zlatestranky.sk/osoby-az/{0}/{1}/"

# One shared session so connections are reused across all requests;
# a desktop browser User-Agent avoids trivial bot filtering.
rqs = requests.Session()
rqs.headers.update({"User-Agent": "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36"})

# Accumulated, duplicate-free results (pre-loaded from disk further below).
names = []
surnames = []
def get_names(soup):
    """Extract person entries from a parsed results page and append any
    previously unseen names/surnames to the global `names`/`surnames` lists.

    Each <li> in the matched result columns holds text starting with
    "SURNAME NAME ..." plus noise; short tokens and punctuation are stripped
    before splitting.
    """
    global names, surnames
    for column in soup.find_all("ul", class_="col6 pl60 t-p0 p-p0"):
        for row in column.find_all("li"):
            text = row.get_text()
            # drop 1-2 character tokens (initials, abbreviations) and punctuation
            text = re.sub(r"\b\w{1,2}\b|[-_,.]", "", text)
            # collapse the runs of spaces left behind by the removal above
            text = re.sub(r"[ ]{2,}", " ", text)
            split = text.split()
            # need at least "SURNAME NAME"; any extra tokens are ignored
            if len(split) >= 2:
                # str.title() already lowercases the rest of each word,
                # so the original's extra .lower() was redundant
                name = split[1].title()
                surname = split[0].title()
                # NOTE(review): this runs from worker threads; the
                # check-then-append below is not atomic, so rare duplicates
                # are possible — confirm whether that matters downstream.
                if name not in names:
                    names.append(name)
                if surname not in surnames:
                    surnames.append(surname)
def scrape(urll):
    """Download one results page and hand the parsed HTML to get_names()."""
    response = rqs.get(urll)
    page = BeautifulSoup(response.text, 'html.parser')
    get_names(page)
def save_list(listt, filename):
    """Write every item of `listt` to `filename`, UTF-8 with CRLF line endings.

    Mode "w" already truncates the file on open, so the explicit
    fp.truncate() the original called was redundant and has been dropped.
    """
    with codecs.open(filename, "w", "utf-8") as fp:
        fp.writelines("{}\r\n".format(item) for item in listt)
def load_list(filename):
    """Read a previously saved list back in; a missing file yields []."""
    if not os.path.isfile(filename):
        return []
    with codecs.open(filename, "r", "utf-8") as fp:
        # strip() removes the trailing CRLF (and any stray whitespace)
        return [line.strip() for line in fp]
def update_progress(text, done, total):
    """Render a carriage-return style progress bar on stdout.

    Rewrites the current terminal line each call; appends a newline once
    `done` reaches `total` so the bar is left on screen.
    """
    bar_length = 30
    filled = int(round(bar_length * done / total))
    bar = "█" * filled + "-" * (bar_length - filled)
    line = "\r{} [{}] ({}/{})".format(text, bar, done, total)
    if done == total:
        line += "\n"
    sys.stdout.write(line)
    sys.stdout.flush()
# Resume from whatever was scraped on previous runs.
names = load_list(NAMES_FILENAME)
surnames = load_list(SURNAMES_FILENAME)

print("This script was made by LBLZR_. Pls no steal, thank.")

for letter in letters:
    # Fetch page 1 first to learn how many pages this letter has.
    r = rqs.get(url.format(letter, 1))
    soup = BeautifulSoup(r.text, 'html.parser')
    paginate = soup.find("ul", class_="paginate ptb30 divider").find_all("a")
    # The second-to-last pagination link holds the highest page number;
    # a single link means there is only one page.
    total_pages = int(paginate[-2].get_text()) if len(paginate) > 1 else 1
    print("Scraping letter {} with total of {} pages".format(letter, total_pages))
    update_progress("Scraping...", 1, total_pages)
    get_names(soup)
    # Remaining pages are fetched concurrently.
    if total_pages > 1:
        with ThreadPoolExecutor(max_workers=15) as executor:
            for page in range(2, total_pages + 1):
                # NOTE(review): the bar advances at submit time, not when a
                # page actually finishes downloading.
                update_progress("Scraping...", page, total_pages)
                executor.submit(scrape, url.format(letter, page))
                # Throttle submissions so the site is not hammered.
                sleep(0.2)
        # The with-block exit already performs shutdown(wait=True), so the
        # explicit redundant call has been removed.
    print("Finished scraping letter {}, saving...".format(letter))
    save_list(names, NAMES_FILENAME)
    save_list(surnames, SURNAMES_FILENAME)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment