Skip to content

Instantly share code, notes, and snippets.

@impshum
Created November 8, 2020 18:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save impshum/c0ca6b401aeb69276cd286aa3a08f630 to your computer and use it in GitHub Desktop.
from bs4 import BeautifulSoup
from splinter import Browser
from selenium.webdriver.chrome.options import Options
import csv
# Scrape targets on alternate.ch: each entry maps a category slug to the
# sub-page slugs that must be visited individually (an empty 'pages' list
# means the category landing page itself is scraped).
categories = [
    {slug: {'pages': list(pages)}}
    for slug, pages in (
        ('Arbeitsspeicher', ()),
        ('Capture-Karten', ()),
        ('PC-Komponenten', ('CPUs', 'Festplatten', 'Laufwerke', 'Mainboards')),
        ('Grafikkarten', ()),
        ('Netzteile', ()),
        ('Netzwerkkarten', ()),
        ('PC-Gehäuse', ()),
        ('PC-Gehäuselüfter', ()),
        ('Soundkarten', ()),
        ('SSD', ()),
        ('Wasserkühlungen', ()),
    )
]
# CSV column order for the output file.
fieldnames = ['category', 'name', 'price', 'stock', 'url']
# Output path for the scraped product rows.
csv_file = 'products.csv'
# Accumulator filled by scraper() and flushed by writer().
results = []
def scraper(browser, url, cat, page):
    """Scrape one category/sub-page URL and append product dicts to ``results``.

    Two page layouts are handled: a flat product list (``.productBox`` anchors)
    and a carousel layout (``.product-carousel-card``), whose stock info lives
    on the product detail page and is fetched in a second browser tab.

    Args:
        browser: An open splinter Browser instance.
        url: Fully-qualified category/sub-page URL to visit.
        cat: Category slug, stored in each result row.
        page: Sub-page slug (used only for logging on carousel pages).
    """
    browser.visit(url)
    # NOTE(review): if neither selector ever becomes visible this loop spins
    # forever — relies on one of the two layouts always appearing. TODO confirm.
    while True:
        if browser.is_element_visible_by_css('.productBox', wait_time=2):
            # Flat list layout: everything we need is on this page.
            soup = BeautifulSoup(browser.html, 'lxml')
            products = soup.find_all('a', {'class': 'productBox'})
            print(f'found {len(products)} {cat} products')
            for a in products:
                url = a['href']
                for row in a.find_all('div', {'class': 'my-3'}):
                    name = row.find('div', {'class': 'product-name'})
                    brand = name.find('span').text
                    # Re-insert a space after the brand span, which get_text
                    # collapses away.
                    name = name.get_text(strip=True).replace(brand, f'{brand} ')
                    price = row.find('span', {'class': 'price'}).get_text(strip=True).replace('CHF ', '')
                    delivery_info = row.find('div', {'class': 'delivery-info'}).get_text(strip=True)
                    results.append({'category': cat, 'name': name, 'price': price, 'stock': delivery_info, 'url': url})
            break  # END LIST PAGE
        elif browser.is_element_visible_by_css('.product-carousel-card', wait_time=2):
            # Carousel layout: stock info is only on the detail page, so each
            # product is opened in a second tab, read, then closed.
            soup = BeautifulSoup(browser.html, 'lxml')
            products = soup.find_all('div', {'class': 'product-carousel-card'})
            print(f'found {len(products)} {page} products')
            for card in products:
                url = card.find('a', href=True)['href']
                brand = card.find('div', {'class': 'manufacturer'}).text
                name = card.find('div', {'class': 'product-name'}).text
                name = f'{brand} {name}'
                price = card.find('div', {'class': 'price'}).get_text(strip=True).replace('CHF ', '')
                browser.execute_script(f'window.open("{url}");')
                browser.windows.current = browser.windows[1]
                soup = BeautifulSoup(browser.html, 'lxml')
                # presumably the first <b> on the detail page is the stock
                # line — verify against the live page markup.
                delivery_info = soup.find('b').get_text(strip=True)
                browser.windows[1].close()
                browser.windows.current = browser.windows[0]
                results.append({'category': cat, 'name': name, 'price': price, 'stock': delivery_info, 'url': url})
            break  # END CAROUSEL PAGE
def writer(csv_file, rows=None, field_names=None):
    """Write scraped product rows to *csv_file* as CSV with a header row.

    Args:
        csv_file: Destination file path.
        rows: Iterable of row dicts; defaults to the module-level ``results``.
        field_names: Column order; defaults to the module-level ``fieldnames``.
    """
    if rows is None:
        rows = results
    if field_names is None:
        field_names = fieldnames
    # newline='' is required by the csv module to prevent blank lines on
    # Windows; explicit utf-8 keeps the umlauts in product names intact.
    with open(csv_file, mode='w', newline='', encoding='utf-8') as fh:
        csv_writer = csv.DictWriter(fh, fieldnames=field_names)
        csv_writer.writeheader()
        csv_writer.writerows(rows)
def main():
    """Scrape every configured category on alternate.ch and dump the results
    to the module-level CSV file."""
    chrome_options = Options()
    # uBlock keeps pages light; the .crx must sit next to this script.
    chrome_options.add_extension('ublock_origin.crx')
    with Browser('chrome', chrome_options=chrome_options) as browser:
        for category in categories:
            for cat, all_pages in category.items():
                pages = all_pages['pages']
                if pages:
                    # Category with explicit sub-pages: scrape each one.
                    for page in pages:
                        page_url = f'{cat}/{page}/'
                        url = f'https://www.alternate.ch/{page_url}?lpf=9999'
                        scraper(browser, url, cat, page)
                else:
                    # No sub-pages: scrape the category landing page itself.
                    page = ''
                    url = f'https://www.alternate.ch/{cat}/?lpf=9999'
                    scraper(browser, url, cat, page)
    writer(csv_file)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment