# jurand71/selenium_bg_pcz_publications_scraping.py Secret

## selenium_bg_pcz_publications_scraping.py
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
import time

import pandas as pd

# Search form of the publication database that gets scraped.
START_URL = 'https://bg.pcz.pl/apisnb'

# Search parameters typed into the form.
UNIT_NAME = "Wydział Infrastruktury i Środowiska"
YEAR_START = 2022
YEAR_END = ""  # an empty string leaves the "year_to" field untouched

# Locators used on the result list and on the per-publication detail view.
RESULT_LINK_XPATH = "//tbody/tr/td/a[@class='apisnb_book']"
DETAIL_ROW_XPATH = "//*[@class= 'thright']/tbody/tr"
CLOSE_BUTTON_XPATH = "//input[@name='book_close']"

# Columns written to the CSV; labels scraped from the detail view that are
# not listed here are dropped by the DataFrame constructor.
CSV_COLUMNS = ['Tytuł', 'Autorzy', 'Tytuł czasopisma',
               'Rok publikacji', 'Wolumin', 'Zeszyt/Issue', 'Strony', 'URL',
               'Identyfikator DOI']
OUTPUT_CSV = 'publications.csv'


def parse_publication_rows(row_texts):
    """Convert "label: value" lines into a ``{label: value}`` dict.

    Each line is split at its first colon only, so values that contain
    colons themselves (for example URLs) stay intact. Lines without any
    colon are skipped. Surrounding whitespace is stripped from labels and
    values so that labels match the CSV column names exactly and values do
    not carry the blank that follows the colon.

    Args:
        row_texts: Iterable of strings, one per table row.

    Returns:
        A dict mapping each label to its value; when a label occurs more
        than once, the last occurrence wins.
    """
    publication = {}
    for text in row_texts:
        label, separator, value = text.partition(':')
        if separator:
            publication[label.strip()] = value.strip()
    return publication


def submit_search(driver, unit_name, year_start, year_end=""):
    """Fill in the search form and submit it.

    Args:
        driver: Selenium WebDriver with the search page already loaded.
        unit_name: Text typed into the "unit" field.
        year_start: Value typed into the "year_from" field.
        year_end: Value typed into the "year_to" field; an empty string
            leaves that field untouched.
    """
    driver.find_element(By.NAME, "unit").send_keys(unit_name)
    driver.find_element(By.NAME, "year_from").send_keys(year_start)

    # The field is looked up even when it is not filled in, so a missing
    # form field still surfaces as an error.
    year_to = driver.find_element(By.NAME, "year_to")
    if year_end != "":
        year_to.send_keys(year_end)

    driver.find_element(By.NAME, "form_search").send_keys(Keys.RETURN)

    # NOTE(review): fixed pause to let the result list appear; an explicit
    # wait would be more robust but needs a known "results ready" marker.
    time.sleep(1)


def collect_publications(driver):
    """Open every publication in the result list and read its details.

    Args:
        driver: Selenium WebDriver showing the search results.

    Returns:
        A list with one ``{label: value}`` dict per publication link found.
    """
    publications = []
    link_count = len(driver.find_elements(By.XPATH, RESULT_LINK_XPATH))

    for index in range(link_count):
        driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")

        # The links are looked up again on every pass; presumably references
        # obtained before a detail view was opened and closed are no longer
        # usable afterwards -- confirm against the page behaviour.
        links = driver.find_elements(By.XPATH, RESULT_LINK_XPATH)
        links[index].click()

        time.sleep(0.5)

        detail_rows = driver.find_elements(By.XPATH, DETAIL_ROW_XPATH)
        publications.append(
            parse_publication_rows(row.text for row in detail_rows)
        )

        driver.find_element(By.XPATH, CLOSE_BUTTON_XPATH).click()

        time.sleep(1)

    return publications


def save_publications(publications, path=OUTPUT_CSV):
    """Write the scraped publications to a CSV file.

    Args:
        publications: List of ``{label: value}`` dicts.
        path: Destination passed to ``DataFrame.to_csv``.
    """
    df = pd.DataFrame(publications, columns=CSV_COLUMNS)
    df.to_csv(path)


def main():
    """Run the whole scrape: search, collect the details, write the CSV."""
    driver = webdriver.Chrome()
    try:
        driver.get(START_URL)
        submit_search(driver, UNIT_NAME, YEAR_START, YEAR_END)
        publications = collect_publications(driver)
    finally:
        # Always close the browser, even when a lookup or click fails.
        driver.quit()

    save_publications(publications)


if __name__ == "__main__":
    main()