Scrapes business website URLs from gelbeseiten.de search results and saves them to a JSON file.
import argparse
import base64
import json
import re
import sys

import requests

urls = []  # Collected website URLs, written to disk at the end.
def percent_complete(step, total_steps, bar_width=60, title="", print_perc=True):
    # UTF-8 left blocks: full, 1/8, 1/4, 3/8, 1/2, 5/8, 3/4, 7/8
    utf_8s = ["█", "▏", "▎", "▍", "▌", "▋", "▊", "▉"]
    perc = 100 * float(step) / float(total_steps)
    max_ticks = bar_width * 8
    num_ticks = int(round(perc / 100 * max_ticks))
    full_ticks = num_ticks // 8  # Number of full blocks
    part_ticks = num_ticks % 8   # Size of partial block (array index)

    disp = bar = ""                # Blank out variables
    bar += utf_8s[0] * full_ticks  # Add full blocks into Progress Bar

    # If part_ticks is zero, then no partial block, else append part char
    if part_ticks > 0:
        bar += utf_8s[part_ticks]

    # Pad Progress Bar with fill character
    bar += "▒" * int((max_ticks / 8 - float(num_ticks) / 8.0))

    if len(title) > 0:
        disp = title + ": "  # Optional title to progress display

    # Print progress bar in green: https://stackoverflow.com/a/21786287/6929343
    disp += "\x1b[0;32m"  # Color Green
    disp += bar           # Progress bar to progress display
    disp += "\x1b[0m"     # Color Reset

    if print_perc:
        # If requested, append percentage complete to progress display
        if perc > 100.0:
            perc = 100.0  # Fix "100.04 %" rounding error
        disp += " {:6.2f}".format(perc) + " %"

    # Output to terminal repetitively over the same line using '\r'.
    sys.stdout.write("\r" + disp)
    sys.stdout.flush()
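
# A quick, illustrative way to see the bar in action (the step count and
# delay are arbitrary, not part of the original script; `time` would need
# to be imported if you uncomment the sleep):
#
#     for i in range(1, 101):
#         percent_complete(i, 100, title="Demo")
#         # time.sleep(0.02)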
def parse_body(resp):
    # Each search result embeds its website link as a base64-encoded
    # string in a webseiteLink=\"...\" attribute; decode and collect them.
    base64_urls = re.findall(
        r'(?s)(?<=webseiteLink=\\").*?(?=\\">)', resp)
    for url in base64_urls:
        urls.append(base64.b64decode(url).decode('utf-8'))
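
# To illustrate what parse_body extracts (hypothetical payload snippet):
# the AJAX response embeds entries such as
#     webseiteLink=\"aHR0cHM6Ly9leGFtcGxlLmRl\">
# and base64.b64decode('aHR0cHM6Ly9leGFtcGxlLmRl').decode('utf-8')
# yields 'https://example.de'.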
def scrape(search, location, radius):
    url = 'https://www.gelbeseiten.de/ajaxsuche'
    data = {'umkreis': radius,
            'verwandt': 'false',
            'WAS': search,
            'WO': location,
            'position': '1',
            'anzahl': '10',
            'sortierung': 'relevanz'}
    resp = requests.post(url, data=data)
    if resp.status_code == 200:
        parse_body(resp.text)
        total_records = int(resp.json()['gesamtanzahlTreffer'])
        print('Found {} records.'.format(total_records))
        print('Extracting URLs...')
        # The first page is already parsed above; advance past it so it
        # is not fetched (and its URLs collected) twice.
        data['position'] = str(int(data['position']) + int(data['anzahl']))
        while int(data['position']) <= total_records:
            percent_complete(int(data['position']), total_records)
            resp = requests.post(url, data=data)
            if resp.status_code != 200:
                print('Failed to get position {}. Retrying...'.format(
                    data['position']))
                continue
            data['position'] = str(int(data['position']) + int(data['anzahl']))
            parse_body(resp.text)
        print('\n')
    else:
        print('Failed to get initial page.')
    with open('{}.json'.format(search), 'w') as outfile:
        json.dump(urls, outfile)
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        prog='yellow-pages-scraper.py',
        description='Scrapes URLs from gelbeseiten.de')
    parser.add_argument('-s', dest='search', type=str,
                        required=True, help='search term')
    parser.add_argument('-l', dest='location', type=str,
                        required=True, help='search location')
    parser.add_argument('-r', dest='radius', type=str,
                        required=True, help='search radius')
    args = parser.parse_args()
    scrape(args.search, args.location, args.radius)
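
Example invocation (the search term, location, and radius values below are illustrative, not from the original; the radius is passed through verbatim as gelbeseiten.de's umkreis parameter):

    python yellow-pages-scraper.py -s Friseur -l Berlin -r 50000

The decoded URLs are written to Friseur.json as a flat JSON array, one entry per extracted website link.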