Scrapes business website URLs from gelbeseiten.de search results and saves them to a JSON file.
import argparse
import base64
import json
import re
import sys

import requests

urls = []  # Collected website URLs, written to disk at the end.
def percent_complete(step, total_steps, bar_width=60, title="", print_perc=True):
    # UTF-8 left blocks: full, 1/8, 1/4, 3/8, 1/2, 5/8, 3/4, 7/8
    utf_8s = ["█", "▏", "▎", "▍", "▌", "▋", "▊", "▉"]
    perc = 100 * float(step) / float(total_steps)
    max_ticks = bar_width * 8
    num_ticks = int(round(perc / 100 * max_ticks))
    full_ticks = num_ticks // 8  # Number of full blocks
    part_ticks = num_ticks % 8   # Size of partial block (array index)

    disp = bar = ""                # Blank out variables
    bar += utf_8s[0] * full_ticks  # Add full blocks into Progress Bar

    # If part_ticks is zero, then no partial block, else append part char
    if part_ticks > 0:
        bar += utf_8s[part_ticks]

    # Pad Progress Bar with fill character
    bar += "▒" * int((max_ticks / 8 - float(num_ticks) / 8.0))

    if len(title) > 0:
        disp = title + ": "  # Optional title to progress display

    # Print progress bar in green: https://stackoverflow.com/a/21786287/6929343
    disp += "\x1b[0;32m"  # Color Green
    disp += bar           # Progress bar to progress display
    disp += "\x1b[0m"     # Color Reset

    if print_perc:
        # If requested, append percentage complete to progress display
        if perc > 100.0:
            perc = 100.0  # Fix "100.04 %" rounding error
        disp += " {:6.2f}".format(perc) + " %"

    # Output to terminal repetitively over the same line using '\r'.
    sys.stdout.write("\r" + disp)
    sys.stdout.flush()
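
# A quick, illustrative way to see the bar in action (the step count and
# delay are arbitrary, not part of the original script; `time` would need
# to be imported if you uncomment the sleep):
#
#     for i in range(1, 101):
#         percent_complete(i, 100, title="Demo")
#         # time.sleep(0.02)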
def parse_body(resp):
    # Each search result embeds its website link as a base64-encoded
    # string in a webseiteLink=\"...\" attribute; decode and collect them.
    base64_urls = re.findall(
        r'(?s)(?<=webseiteLink=\\").*?(?=\\">)', resp)
    for url in base64_urls:
        urls.append(base64.b64decode(url).decode('utf-8'))
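
# To illustrate what parse_body extracts (hypothetical payload snippet):
# the AJAX response embeds entries such as
#     webseiteLink=\"aHR0cHM6Ly9leGFtcGxlLmRl\">
# and base64.b64decode('aHR0cHM6Ly9leGFtcGxlLmRl').decode('utf-8')
# yields 'https://example.de'.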
def scrape(search, location, radius):
    url = 'https://www.gelbeseiten.de/ajaxsuche'
    data = {'umkreis': radius,
            'verwandt': 'false',
            'WAS': search,
            'WO': location,
            'position': '1',
            'anzahl': '10',
            'sortierung': 'relevanz'}
    resp = requests.post(url, data=data)
    if resp.status_code == 200:
        parse_body(resp.text)
        total_records = int(resp.json()['gesamtanzahlTreffer'])
        print('Found {} records.'.format(total_records))
        print('Extracting URLs...')
        # The first page is already parsed above; advance past it so it
        # is not fetched (and its URLs collected) twice.
        data['position'] = str(int(data['position']) + int(data['anzahl']))
        while int(data['position']) <= total_records:
            percent_complete(int(data['position']), total_records)
            resp = requests.post(url, data=data)
            if resp.status_code != 200:
                print('Failed to get position {}. Retrying...'.format(
                    data['position']))
                continue
            data['position'] = str(int(data['position']) + int(data['anzahl']))
            parse_body(resp.text)
        print('\n')
    else:
        print('Failed to get initial page.')
    with open('{}.json'.format(search), 'w') as outfile:
        json.dump(urls, outfile)
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        prog='yellow-pages-scraper.py',
        description='Scrapes URLs from gelbeseiten.de')
    parser.add_argument('-s', dest='search', type=str,
                        required=True, help='search term')
    parser.add_argument('-l', dest='location', type=str,
                        required=True, help='search location')
    parser.add_argument('-r', dest='radius', type=str,
                        required=True, help='search radius')
    args = parser.parse_args()
    scrape(args.search, args.location, args.radius)
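
Example invocation (the search term, location, and radius values below are illustrative, not from the original; the radius is passed through verbatim as gelbeseiten.de's umkreis parameter):

    python yellow-pages-scraper.py -s Friseur -l Berlin -r 50000

The decoded URLs are written to Friseur.json as a flat JSON array, one entry per extracted website link.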