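# Scrape item names, article numbers, prices, stock and future delivery forecasts
# from the IBC SOLAR shop (shop.ibc-solar.de) into a single CSV file.
# Requires: requests, beautifulsoup4, lxml, tqdm, and credentials in settings.cfg.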
import os
import re
import sys
import csv
import shutil
# import pickle
import requests
import threading
import concurrent.futures
from glob import glob
from tqdm import tqdm
from bs4 import BeautifulSoup
from configparser import RawConfigParser
config = RawConfigParser()
config.read('settings.cfg')
login = config.get('General', 'login').strip()
password = config.get('General', 'password').strip()
output_file = config.get('General', 'output_file').strip()
assert login and password and output_file
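# settings.cfg is expected to look roughly like this (illustrative values):
# [General]
# login = your_portal_login
# password = your_portal_password
# output_file = output.csv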
temp_dir_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'temp')
if os.path.isdir(temp_dir_path):
    shutil.rmtree(temp_dir_path)
os.mkdir(temp_dir_path)
post_data = {
    'user': login,
    'pass': password,
    'permalogin': '0',
    'logintype': 'login',
    'pid': '4'
}
print('Logging in...')
session = requests.Session()
session.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'}
response = session.post('https://portal.ibc-solar.de/', data=post_data)
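# 'Abmelden' means "log out"; its absence from the response is taken as a failed login.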
if 'Abmelden' not in response.text:
    print('Unable to log in.')
    sys.exit(1)
# with open('cookies', 'wb') as f:
# pickle.dump(session.cookies, f)
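# Keep the authenticated cookies so the per-thread download sessions below can reuse them.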
cookies = session.cookies
print('Downloading subcategories...')
category_urls = [
    'https://shop.ibc-solar.de/shop/de/shop/PV/Solarmodule/',
    'https://shop.ibc-solar.de/shop/de/shop/PV/Wechselrichter/',
    'https://shop.ibc-solar.de/shop/de/shop/PV/Speicher/',
    'https://shop.ibc-solar.de/shop/de/shop/PV/Zubehoer/'
]
subcategory_urls = [
    'https://shop.ibc-solar.de/shop/de/shop/PV/Solarmodule/Module/'
]
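# The Solarmodule category is excluded from the loop below because its subcategory is pre-seeded above.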
for current_category_url in tqdm(category_urls[1:]):
    current_page_number = 1
    while True:
        soup = BeautifulSoup(session.get(f'{current_category_url}?page={current_page_number}').text, 'lxml')
        for soup_link in soup.find_all('a', class_='categorylist__item'):
            subcategory_urls.append('https://shop.ibc-solar.de{}'.format(soup_link.get('href')))
        if current_page_number == 1:
            # pagination text reads "Seite 1 von N"; it is absent when there is only one page
            soup_div = soup.find('div', class_='pagination__item pagination__item--text')
            if soup_div is None:
                pages_total_number = 1
            else:
                pages_total_number = int(re.match(r'Seite 1 von (\d+)', soup_div.text).groups()[0])
        if current_page_number < pages_total_number:
            current_page_number += 1
        else:
            break
print('Downloading items list...')
item_urls = []
for current_subcategory_url in tqdm(subcategory_urls):
    current_page_number = 1
    while True:
        soup = BeautifulSoup(session.get(f'{current_subcategory_url}?page={current_page_number}').text, 'lxml')
        for soup_link in soup.find_all('a', class_='itemlist__wrapper'):
            url = soup_link.get('href')
            if url not in item_urls:
                item_urls.append(url)
        if current_page_number == 1:
            soup_div = soup.find('div', class_='pagination__item pagination__item--text')
            if soup_div is None:
                pages_total_number = 1
            else:
                pages_total_number = int(re.match(r'Seite 1 von (\d+)', soup_div.text).groups()[0])
        if current_page_number < pages_total_number:
            current_page_number += 1
        else:
            break
item_urls = list(enumerate(item_urls, start=1))
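# Pair each URL with a 1-based index; the index later names the temp file (item0000001.html, ...).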
# import json
# json.dump(item_urls, open('temp.json', 'w', encoding='utf-8'), ensure_ascii=False, indent=1)
# import json
# item_urls = json.load(open('temp.json', encoding='utf-8'))
print('Downloading items...')
thread_local = threading.local()
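# requests.Session objects are not guaranteed to be thread-safe, so each worker thread gets its own.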
def get_session():
    if not hasattr(thread_local, 'session'):
        thread_local.session = requests.Session()
    return thread_local.session
def download_item_page(item_url):
    session = get_session()
    session.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'}
    # with open('cookies', 'rb') as f:
    #     session.cookies.update(pickle.load(f))
    session.cookies.update(cookies)
    page_file_path = os.path.join(temp_dir_path, f'item{item_url[0]:07d}.html')
    while True:
        try:
            with open(page_file_path, 'w', encoding='utf-8', newline='\n') as f:
                f.write(session.get(f'https://shop.ibc-solar.de{item_url[1]}', timeout=60).text)
        except Exception:
            pass  # network errors are ignored and the download is retried
        finally:
            # treat pages under ~50 KB as incomplete and fetch them again
            if os.path.isfile(page_file_path) and os.stat(page_file_path).st_size > 50000:
                break
    progress_bar.update()
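# Fetch all item pages with a small thread pool; the shared tqdm bar tracks overall progress.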
progress_bar = tqdm(total=len(item_urls))
with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
    executor.map(download_item_page, item_urls)
progress_bar.close()
print('Processing...')
csv_column_names = (
    'name', 'article no.', 'price', 'stock',
    'future_delivery_date1', 'future_delivery_stock1',
    'future_delivery_date2', 'future_delivery_stock2',
    'future_delivery_date3', 'future_delivery_stock3',
    'future_delivery_date4', 'future_delivery_stock4',
    'future_delivery_date5', 'future_delivery_stock5',
    'future_delivery_date6', 'future_delivery_stock6',
    'future_delivery_date7', 'future_delivery_stock7',
    'future_delivery_date8', 'future_delivery_stock8'
)
f_out = open(output_file, 'w', newline='', encoding='utf-8')
writer = csv.DictWriter(f_out, fieldnames=csv_column_names)
writer.writeheader()
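# Parse every saved page and write one CSV row per item.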
try:
    for file_path in tqdm(sorted(glob(os.path.join(temp_dir_path, 'item*.html')))):
        with open(file_path, encoding='utf-8') as f_in:
            soup = BeautifulSoup(f_in, 'lxml')
        row = {}
        row['name'] = soup.find('span', itemprop='name').text.strip()
        row['article no.'] = soup.find('div', class_='itemcardItemno').text.strip().replace('Artikel-Nr.: ', '')
        # strip the € sign and normalize prices like '1.234,-' to '1.234,00'
        row['price'] = soup.find('div', class_='base_price').text.replace('€', '').replace(',-', ',00').strip()
        # 'verfügbar' means "available"
        if soup.find('div', class_='inventory__label').text.strip() == 'verfügbar':
            row['stock'] = 'yes'
        else:
            row['stock'] = 0
        # the delivery forecast cells and the chart <script> share this ancestor element
        soup_div = soup.find('div', class_='table_cell datecell').parent.parent.parent.parent.parent.parent
        soup_script = soup_div.find_all('script')[1]
        # the chart's "data: [...]" array carries the stock figures for up to eight future deliveries
        match = re.match(r'^.+data: \[([^,]*),\s*([^,]*),\s*([^,]*),\s*([^,]*),\s*([^,]*),\s*([^,]*),\s*([^,]*),\s*([^,]*)\]', soup_script.string, flags=re.DOTALL)
        for n, soup_cell in enumerate(soup_div.find_all('div', class_='table_cell datecell'), start=1):
            # date ranges look like "20.04.-26.04.2020"; keep the start day and the year
            m = re.match(r'([^-]+).+(\d{4})$', soup_cell.text.strip())
            row[f'future_delivery_date{n}'] = m[1] + m[2] if m else ''
            row[f'future_delivery_stock{n}'] = match[n]
        writer.writerow(row)
finally:
    f_out.close()
shutil.rmtree(temp_dir_path)
# os.remove('cookies')
print('Success!')