@marcogoldin · Last active April 22, 2019
Simple crawler: searches the SBN OPAC (https://opac.sbn.it) by ISBN with Selenium and returns the matching record's title, ISBN, author, subjects, Dewey class and URL as a one-row pandas DataFrame.
import pandas as pd
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

# Run Firefox headless; assumes geckodriver sits one directory up.
# (Selenium 3 API: executable_path was removed in Selenium 4.)
opts = Options()
opts.headless = True
driver = webdriver.Firefox(executable_path='../geckodriver', options=opts)


def crawler(codice_isbn):
    """Search the SBN OPAC for an ISBN and return the record as a one-row DataFrame."""
    # Open the advanced-search form and query by ISBN (input field 'fieldval:4').
    driver.get('https://opac.sbn.it/opacsbn/opac/iccu/avanzata.jsp')
    print(f'Page title: {driver.title}')
    isbn = driver.find_element_by_id('fieldval:4')
    isbn.send_keys(str(codice_isbn))
    isbn.submit()
    # Wait up to 5 s for elements in the lookups below.
    driver.implicitly_wait(5)

    # Follow the first hit in the result list.
    click_link = driver.find_elements_by_xpath("//a[contains(@title, 'Risultato 1')]")
    if not click_link:
        raise ValueError(f'No results for ISBN {codice_isbn}')
    titolo = click_link[0].text
    print(titolo)
    driver.find_element_by_link_text(titolo).click()
    print(f'Page title: {driver.title}')

    # Rows 9-12 of the 'details' table hold ISBN, author, subjects and Dewey class;
    # td[1] is the label, td[2] the value.
    numeri = driver.find_element_by_xpath("//div/table[@id='details']/tbody/tr[9]/td[1]")
    numeri_isbn = driver.find_element_by_xpath("//div/table[@id='details']/tbody/tr[9]/td[2]")
    nomi = driver.find_element_by_xpath("//div/table[@id='details']/tbody/tr[10]/td[1]")
    nomi_autore = driver.find_element_by_xpath("//div/table[@id='details']/tbody/tr[10]/td[2]")
    soggetti1 = driver.find_element_by_xpath("//div/table[@id='details']/tbody/tr[11]/td[1]")
    soggetti2 = driver.find_element_by_xpath("//div/table[@id='details']/tbody/tr[11]/td[2]")
    cdd1 = driver.find_element_by_xpath("//div/table[@id='details']/tbody/tr[12]/td[1]")
    cdd2 = driver.find_element_by_xpath("//div/table[@id='details']/tbody/tr[12]/td[2]")
    print(f'{numeri.text}:{numeri_isbn.text.replace("·","")} \n{nomi.text}:{nomi_autore.text.replace("·","").replace(" scheda di autorità","")}')
    print(f'{soggetti1.text}:{soggetti2.text.replace("·","")} \n{cdd1.text}:{cdd2.text.replace("·","")}')
    print(f'Page URL: {driver.current_url}')

    # Collect the cleaned fields (bullet markers and the 'scheda di autorità'
    # suffix stripped) into a dict of single-item lists for pandas.
    dizio = {'titolo': [], 'isbn': [], 'autore': [], 'soggetti': [], 'dewey': [], 'url': []}
    dizio['titolo'].append(titolo)
    dizio['isbn'].append(numeri_isbn.text.replace("·", ""))
    dizio['autore'].append(nomi_autore.text.replace("·", "").replace(" scheda di autorità", ""))
    dizio['soggetti'].append(soggetti2.text.replace("·", ""))
    dizio['dewey'].append(cdd2.text.replace("·", ""))
    dizio['url'].append(driver.current_url)
    print(f'\n**** JSON data ****\n {dizio}')
    df = pd.DataFrame(data=dizio)
    return df


# just run
crawler(9788807033247)
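
Because crawler() returns a one-row DataFrame per ISBN, a natural extension is to loop over a list of codes and concatenate the results. The sketch below is only a suggestion, assuming the driver and crawler() defined above are still in scope; the extra list entries and the CSV filename opac_sbn.csv are illustrative, not part of the original gist.

# Hedged sketch: batch-crawl several ISBNs and save one combined table.
isbn_list = [9788807033247]  # extend with further ISBN codes as needed
frames = [crawler(isbn) for isbn in isbn_list]
df_all = pd.concat(frames, ignore_index=True)
df_all.to_csv('opac_sbn.csv', index=False)  # illustrative output path
driver.quit()  # release the headless Firefox session when done

Quitting the driver at the end matters for a headless run: each forgotten session leaves a Firefox process alive in the background.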