Scrapes business website URLs from gelbeseiten.de (the German Yellow Pages)
import requests
import re
import base64
import json
import argparse
import sys


urls = []


def percent_complete(step, total_steps, bar_width=60, title="", print_perc=True):
    # UTF-8 left blocks: full, 1/8, 1/4, 3/8, 1/2, 5/8, 3/4, 7/8
    utf_8s = ["█", "▏", "▎", "▍", "▌", "▋", "▊", "▉"]
    perc = 100 * float(step) / float(total_steps)
    max_ticks = bar_width * 8
    num_ticks = int(round(perc / 100 * max_ticks))
    full_ticks = num_ticks // 8  # Number of full blocks
    part_ticks = num_ticks % 8   # Size of partial block (array index)
    disp = bar = ""              # Blank out variables
    bar += utf_8s[0] * full_ticks  # Add full blocks to the progress bar
    # If part_ticks is zero there is no partial block, else append the part char
    if part_ticks > 0:
        bar += utf_8s[part_ticks]
    # Pad the progress bar with the fill character
    bar += "▒" * int(max_ticks / 8 - num_ticks / 8.0)
    if len(title) > 0:
        disp = title + ": "  # Optional title for the progress display
    # Print the progress bar in green: https://stackoverflow.com/a/21786287/6929343
    disp += "\x1b[0;32m"  # Color green
    disp += bar           # The bar itself
    disp += "\x1b[0m"     # Color reset
    if print_perc:
        # If requested, append the percentage complete
        if perc > 100.0:
            perc = 100.0  # Fix "100.04 %" rounding error
        disp += " {:6.2f}".format(perc) + " %"
    # Redraw the same terminal line on every call using '\r'
    sys.stdout.write("\r" + disp)
    sys.stdout.flush()
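
# A minimal usage sketch for percent_complete (illustrative only; the 100-step
# range and the 0.02 s delay are made up for the demo):
#
#   import time
#   for i in range(1, 101):
#       percent_complete(i, 100, title="Demo")
#       time.sleep(0.02)
#
# Each call redraws a single line such as "Demo: ███▒▒▒ ...  42.00 %" in place.
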
def parse_body(resp):
    # Website links are embedded in the response HTML as base64-encoded
    # payloads inside escaped webseiteLink=\"...\" attributes.
    base64_urls = re.findall(
        r'(?s)(?<=webseiteLink=\\").*?(?=\\">)', resp)
    for url in base64_urls:
        urls.append(base64.b64decode(url).decode('utf-8'))
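
# Worked example (hypothetical fragment, not real response data): given a
# response containing
#   webseiteLink=\"aHR0cHM6Ly9leGFtcGxlLmNvbQ==\">
# the lookbehind/lookahead pair captures the base64 payload between the escaped
# quotes, and base64.b64decode(...) yields b'https://example.com'.
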
def scrape(search, location, radius):
    url = 'https://www.gelbeseiten.de/ajaxsuche'
    data = {'umkreis': radius,          # search radius
            'verwandt': 'false',        # exclude related search terms
            'WAS': search,              # what to search for
            'WO': location,             # where to search
            'position': '1',            # 1-based offset of the first record
            'anzahl': '10',             # page size
            'sortierung': 'relevanz'}   # sort by relevance
    resp = requests.post(url, data=data)
    if resp.status_code == 200:
        parse_body(resp.text)
        total_records = int(resp.json()['gesamtanzahlTreffer'])
        print('Found {} records.'.format(total_records))
        print('Extracting URLs...')
        # The first page was already parsed above, so start paging at 11
        # rather than re-fetching (and double-counting) position 1.
        data['position'] = '11'
        while int(data['position']) <= total_records:
            percent_complete(int(data['position']), total_records)
            resp = requests.post(url, data=data)
            if resp.status_code != 200:
                print('Failed to get position {}. Retrying...'.format(
                    data['position']))
                continue
            data['position'] = str(int(data['position']) + 10)
            parse_body(resp.text)
        print('\n')
    else:
        print('Failed to get initial page.')
    with open('{}.json'.format(search), 'w') as outfile:
        json.dump(urls, outfile)
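
# Paging sketch: the endpoint serves results in windows of 'anzahl' records
# starting at 'position'. With a hypothetical total of 25 records, the calls
# above fetch positions 1 (initial request), 11, and 21; the loop then stops
# because int('31') <= 25 is false.
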
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        prog='yellow-pages-scraper.py',
        description='Scrapes URLs from gelbeseiten.de')
    parser.add_argument('-s', dest='search', type=str,
                        required=True, help='search term')
    parser.add_argument('-l', dest='location', type=str,
                        required=True, help='search location')
    parser.add_argument('-r', dest='radius', type=str,
                        required=True, help='search radius')
    args = parser.parse_args()
    scrape(args.search, args.location, args.radius)
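
Example invocation (the search term, location, and radius values below are
made up; any values gelbeseiten.de accepts will work):

    python yellow-pages-scraper.py -s Friseur -l Berlin -r 50

The collected URLs are written to <search term>.json, here Friseur.json.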