Skip to content

Instantly share code, notes, and snippets.

Created August 8, 2020 15:15
Show Gist options
  • Save LuisArteaga/c9ab15bd9ee5db5880e66cb243ed60c3 to your computer and use it in GitHub Desktop.
Webscraper: FOM Thesis 2020
import csv
import ntpath
import re

from selenium import webdriver as webdriver
from selenium.webdriver.common.by import By
class Configuration:
    """Hold the chrome driver location and the details of the websites to crawl.

    Attributes:
        driver: Path to the chromedriver executable, normalized with Windows
            path rules (``ntpath.normpath``).
        websites: Mapping of a site key to a dict with the keys ``'stocks'``
            (list of stock slugs), ``'base_url'`` and ``'url_parameters'``.
            The crawl script concatenates ``base_url + stock + url_parameters``
            to build each news URL.
    """

    def __init__(self, chrome_driver='./driver/chromedriver.exe'):
        """Store the normalized driver path and the crawl targets.

        Args:
            chrome_driver: Location of the chromedriver executable.
        """
        # ntpath (rather than os.path) always applies Windows rules, so
        # './driver/chromedriver.exe' becomes 'driver\\chromedriver.exe'
        # regardless of the platform running the script.
        self.driver = ntpath.normpath(chrome_driver)
        # NOTE(review): the site key and 'base_url' are empty strings in this
        # copy of the file -- presumably stripped; fill them in before running.
        # NOTE(review): the stock list appears truncated after 'adidas' in
        # this copy; restore the remaining entries if there were any.
        self.websites = {
            '': {
                'stocks': ['adidas'],
                'base_url': '',
                'url_parameters': '-news@intpagenr_',
            },
        }
# Crawl the paginated news list of every configured stock and export the dated
# rows of each page to './export/<stock>-<page>.csv'.
configuration = Configuration()
# Keep the executable path under its own name so `driver` only ever refers to
# the WebDriver session.
driver_path = configuration.driver
# NOTE(review): `executable_path` is the Selenium 3 spelling; newer Selenium 4
# releases expect a Service object instead -- confirm the installed version.
driver = webdriver.Chrome(executable_path=driver_path)
site = configuration.websites['']
BASE_URL = site['base_url']
# A news row starts with a date such as '08.08.20' followed by a newline.
# Compiled once here instead of on every page iteration.
re_date = re.compile(r'[0-9]{2}\.[0-9]{2}\.[0-9]{2}\n')
try:
    for stock in site['stocks']:
        url = BASE_URL + stock + site['url_parameters']
        for i in range(1, 26):
            # NOTE(review): the page-load call is missing from this copy of
            # the script (`url` was built but never used). The URL suffix
            # 'intpagenr_' suggests the page number is appended -- confirm.
            driver.get(url + str(i))
            table_news = driver.find_element(By.XPATH, "//table[@class='table news-list']")
            # './/tr' keeps the search inside the news table; a bare '//tr'
            # is evaluated from the document root and returns every row on
            # the page.
            rows = table_news.find_elements(By.XPATH, './/tr')
            with open('./export/{}-{}.csv'.format(stock, str(i)), 'w', newline='') as file:
                writer = csv.writer(file)
                for element in rows:
                    # Read the text once: every `.text` access is a round trip
                    # to the browser.
                    text = element.text
                    if re_date.match(text):
                        # One CSV column per rendered line of the row. Splitting
                        # the text directly avoids the repr() round trip, which
                        # left escape sequences (\', \\, \t) in the output.
                        writer.writerow(text.split('\n'))
finally:
    # Always shut the browser down, even when a page fails to load or the
    # news table is not found.
    driver.quit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment