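# Scrape item names, article numbers, prices, stock and future delivery forecasts
# from the IBC SOLAR shop (shop.ibc-solar.de) into a single CSV file.
# Requires: requests, beautifulsoup4, lxml, tqdm, and credentials in settings.cfg.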
import os
import re
import sys
import csv
import shutil
# import pickle
import requests
import threading
import concurrent.futures
from glob import glob
from tqdm import tqdm
from bs4 import BeautifulSoup
from configparser import RawConfigParser
config = RawConfigParser()
config.read('settings.cfg')
login = config.get('General', 'login').strip()
password = config.get('General', 'password').strip()
output_file = config.get('General', 'output_file').strip()
assert login and password and output_file
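# settings.cfg is expected to look roughly like this (illustrative values):
# [General]
# login = your_portal_login
# password = your_portal_password
# output_file = output.csv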
temp_dir_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'temp')
if os.path.isdir(temp_dir_path):
    shutil.rmtree(temp_dir_path)
os.mkdir(temp_dir_path)
post_data = {
    'user': login,
    'pass': password,
    'permalogin': '0',
    'logintype': 'login',
    'pid': '4'
}
print('Logging in...')
session = requests.Session()
session.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'}
response = session.post('https://portal.ibc-solar.de/', data=post_data)
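# 'Abmelden' means "log out"; its absence from the response is taken as a failed login.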
if 'Abmelden' not in response.text:
    print('Unable to log in.')
    sys.exit(1)
# with open('cookies', 'wb') as f:
# pickle.dump(session.cookies, f)
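# Keep the authenticated cookies so the per-thread download sessions below can reuse them.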
cookies = session.cookies
print('Downloading subcategories...')
category_urls = [
    'https://shop.ibc-solar.de/shop/de/shop/PV/Solarmodule/',
    'https://shop.ibc-solar.de/shop/de/shop/PV/Wechselrichter/',
    'https://shop.ibc-solar.de/shop/de/shop/PV/Speicher/',
    'https://shop.ibc-solar.de/shop/de/shop/PV/Zubehoer/'
]
subcategory_urls = [
    'https://shop.ibc-solar.de/shop/de/shop/PV/Solarmodule/Module/'
]
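# The Solarmodule category is excluded from the loop below because its subcategory is pre-seeded above.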
for current_category_url in tqdm(category_urls[1:]):
    current_page_number = 1
    while True:
        soup = BeautifulSoup(session.get(f'{current_category_url}?page={current_page_number}').text, 'lxml')
        for soup_link in soup.find_all('a', class_='categorylist__item'):
            subcategory_urls.append('https://shop.ibc-solar.de{}'.format(soup_link.get('href')))
        if current_page_number == 1:
            # pagination text reads "Seite 1 von N"; it is absent when there is only one page
            soup_div = soup.find('div', class_='pagination__item pagination__item--text')
            if soup_div is None:
                pages_total_number = 1
            else:
                pages_total_number = int(re.match(r'Seite 1 von (\d+)', soup_div.text).groups()[0])
        if current_page_number < pages_total_number:
            current_page_number += 1
        else:
            break
print('Downloading items list...')
item_urls = []
for current_subcategory_url in tqdm(subcategory_urls):
    current_page_number = 1
    while True:
        soup = BeautifulSoup(session.get(f'{current_subcategory_url}?page={current_page_number}').text, 'lxml')
        for soup_link in soup.find_all('a', class_='itemlist__wrapper'):
            url = soup_link.get('href')
            if url not in item_urls:
                item_urls.append(url)
        if current_page_number == 1:
            soup_div = soup.find('div', class_='pagination__item pagination__item--text')
            if soup_div is None:
                pages_total_number = 1
            else:
                pages_total_number = int(re.match(r'Seite 1 von (\d+)', soup_div.text).groups()[0])
        if current_page_number < pages_total_number:
            current_page_number += 1
        else:
            break
item_urls = list(enumerate(item_urls, start=1))
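# Pair each URL with a 1-based index; the index later names the temp file (item0000001.html, ...).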
# import json
# json.dump(item_urls, open('temp.json', 'w', encoding='utf-8'), ensure_ascii=False, indent=1)
# import json
# item_urls = json.load(open('temp.json', encoding='utf-8'))
print('Downloading items...')
thread_local = threading.local()
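# requests.Session objects are not guaranteed to be thread-safe, so each worker thread gets its own.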
def get_session():
    if not hasattr(thread_local, 'session'):
        thread_local.session = requests.Session()
    return thread_local.session
def download_item_page(item_url):
    session = get_session()
    session.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'}
    # with open('cookies', 'rb') as f:
    #     session.cookies.update(pickle.load(f))
    session.cookies.update(cookies)
    page_file_path = os.path.join(temp_dir_path, f'item{item_url[0]:07d}.html')
    while True:
        try:
            with open(page_file_path, 'w', encoding='utf-8', newline='\n') as f:
                f.write(session.get(f'https://shop.ibc-solar.de{item_url[1]}', timeout=60).text)
        except Exception:
            pass  # network errors are ignored and the download is retried
        finally:
            # treat pages under ~50 KB as incomplete and fetch them again
            if os.path.isfile(page_file_path) and os.stat(page_file_path).st_size > 50000:
                break
    progress_bar.update()
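# Fetch all item pages with a small thread pool; the shared tqdm bar tracks overall progress.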
progress_bar = tqdm(total=len(item_urls))
with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
    executor.map(download_item_page, item_urls)
progress_bar.close()
print('Processing...')
csv_column_names = (
    'name', 'article no.', 'price', 'stock',
    'future_delivery_date1', 'future_delivery_stock1',
    'future_delivery_date2', 'future_delivery_stock2',
    'future_delivery_date3', 'future_delivery_stock3',
    'future_delivery_date4', 'future_delivery_stock4',
    'future_delivery_date5', 'future_delivery_stock5',
    'future_delivery_date6', 'future_delivery_stock6',
    'future_delivery_date7', 'future_delivery_stock7',
    'future_delivery_date8', 'future_delivery_stock8'
)
f_out = open(output_file, 'w', newline='', encoding='utf-8')
writer = csv.DictWriter(f_out, fieldnames=csv_column_names)
writer.writeheader()
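# Parse every saved page and write one CSV row per item.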
try:
    for file_path in tqdm(sorted(glob(os.path.join(temp_dir_path, 'item*.html')))):
        with open(file_path, encoding='utf-8') as f_in:
            soup = BeautifulSoup(f_in, 'lxml')
        row = {}
        row['name'] = soup.find('span', itemprop='name').text.strip()
        row['article no.'] = soup.find('div', class_='itemcardItemno').text.strip().replace('Artikel-Nr.: ', '')
        # strip the € sign and normalize prices like '1.234,-' to '1.234,00'
        row['price'] = soup.find('div', class_='base_price').text.replace('€', '').replace(',-', ',00').strip()
        # 'verfügbar' means "available"
        if soup.find('div', class_='inventory__label').text.strip() == 'verfügbar':
            row['stock'] = 'yes'
        else:
            row['stock'] = 0
        # the delivery forecast cells and the chart <script> share this ancestor element
        soup_div = soup.find('div', class_='table_cell datecell').parent.parent.parent.parent.parent.parent
        soup_script = soup_div.find_all('script')[1]
        # the chart's "data: [...]" array carries the stock figures for up to eight future deliveries
        match = re.match(r'^.+data: \[([^,]*),\s*([^,]*),\s*([^,]*),\s*([^,]*),\s*([^,]*),\s*([^,]*),\s*([^,]*),\s*([^,]*)\]', soup_script.string, flags=re.DOTALL)
        for n, soup_cell in enumerate(soup_div.find_all('div', class_='table_cell datecell'), start=1):
            # date ranges look like "20.04.-26.04.2020"; keep the start day and the year
            m = re.match(r'([^-]+).+(\d{4})$', soup_cell.text.strip())
            row[f'future_delivery_date{n}'] = m[1] + m[2] if m else ''
            row[f'future_delivery_stock{n}'] = match[n]
        writer.writerow(row)
finally:
    f_out.close()
shutil.rmtree(temp_dir_path)
# os.remove('cookies')
print('Success!')