Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Link grabber from Siemens Siprotec download page
#!/usr/bin/env python3
# Grab links to everything that can be downloaded from the Siemens Siprotec site
#
import time
from selenium import webdriver
import win32com.client as win32
# --- Browser: open the SIPROTEC 4 overview page and switch to its Downloads tab ---
driver = webdriver.Firefox()
print("Ожидаем загрузки страницы")
driver.get("http://w3.siemens.com/smartgrid/global/en/products-systems-solutions/Protection/siprotec4/Pages/overview.aspx")
time.sleep(2)  # fixed pause after navigation

print("Открываем вкладку Downloads")
# The tab is the <a> that wraps a <span> whose text contains 'Downloads'.
downloads_tab = driver.find_element_by_xpath("//a/span[contains(text(),'Downloads')]/..")
downloads_tab.click()
time.sleep(2)  # fixed pause after the click

# --- Excel: visible instance with a fresh worksheet that receives the results ---
print("Создаем Excel лист для результатов")
excel = win32.gencache.EnsureDispatch("Excel.Application")
excel.Visible = True
wb = excel.Workbooks.Add()
sheet = wb.Worksheets.Add()

# Row 1 holds the column captions; `currow` always points at the next row to fill.
header_titles = (
    "Раздел",
    "Подраздел",
    "Под.подраздел",
    "Заголовок",
    "Язык",
    "Дата",
    "Ссылка",
)
currow = 1
for column, header_title in enumerate(header_titles, start=1):
    sheet.Cells(currow, column).Value = header_title
currow += 1
def ParseLinksFromBlock(block):
    """Write one spreadsheet row per download item found inside *block*.

    Every ``tr`` below *block* whose class contains ``iteminfo`` is read for
    its link, language, title and date. Up to three ``h2`` headings are
    looked up through fixed ancestor paths as well; a heading that cannot be
    read is stored as an empty string. The values are written to columns 1-7
    of the module-level ``sheet`` at row ``currow``, and ``currow`` is
    advanced by one for each item.

    Args:
        block: Element exposing ``find_elements_by_xpath``; the container
            that is scanned for item rows.

    Raises:
        Exception: Anything raised while reading the link, language, title
            or date cell is propagated; only the heading lookups are
            best-effort.
    """
    global sheet
    global currow

    items_info = block.find_elements_by_xpath(".//tr[contains(@class,'iteminfo')]")
    for item_info in items_info:
        link = item_info.find_element_by_xpath("td[1]/ul/li/a").get_attribute("href")  # link to the material
        lang = item_info.find_element_by_xpath("td[2]").text  # language
        title = item_info.find_element_by_xpath("td[3]").text  # title of the material
        date = item_info.find_element_by_xpath("td[4]").text  # date as shown in the table cell

        # Section headings that locate the item. Each lookup falls back to ""
        # on failure. `except Exception` (rather than a bare `except`) keeps
        # that fallback while letting KeyboardInterrupt/SystemExit through.
        try:
            sub3_path = item_info.find_element_by_xpath("../../../../div/h2").text
            # NOTE(review): as written both arguments are empty strings, so
            # these two calls leave the text unchanged. Presumably they once
            # stripped special characters - confirm against the original file.
            sub3_path = sub3_path.replace('', '')
            sub3_path = sub3_path.replace('', '')
        except Exception:
            sub3_path = ""
        try:
            sub2_path = item_info.find_element_by_xpath("../../../../../../div/h2").text
        except Exception:
            sub2_path = ""
        try:
            sub1_path = item_info.find_element_by_xpath("../../../../../../../../../div/h2").text
        except Exception:
            sub1_path = ""

        # Columns 1-7, in the order of the header row: three heading levels,
        # then title, language, date and link.
        row_values = (sub1_path, sub2_path, sub3_path, title, lang, date, link)
        for column, value in enumerate(row_values, start=1):
            sheet.Cells(currow, column).Value = value
        currow = currow + 1
# Walk the nested nodes of the page: click each level to expand it, then hand
# the blocks that list materials to ParseLinksFromBlock.
# The handlers below use `except Exception` instead of a bare `except` so that
# Ctrl+C / SystemExit still stop the script; every other failure is handled
# exactly as before.
first_nodes = driver.find_elements_by_xpath(".//div[contains(@class,'section')]/div/div[contains(@class,'c-teaser')]")
for first_node in first_nodes:
    try:
        first_node.click()
        time.sleep(5)  # fixed pause after expanding a top-level node
    except Exception:
        print("Ошибка при попытке раскрыть узел " + first_node.text)
        break  # a failed click stops processing of all remaining top-level nodes

    second_nodes = first_node.find_elements_by_xpath(".//div[contains(@class,'c-teaser')]")
    for second_node in second_nodes:
        try:
            second_node.click()
            time.sleep(2)
        except Exception:
            print("Ошибка при попытке раскрыть узел " + second_node.text)
            break

        # A second-level node may carry its own block of materials.
        # NOTE: this handler also catches any error raised inside
        # ParseLinksFromBlock and reports it with the same message.
        try:
            additional_node = second_node.find_element_by_xpath(".//div[@class='categoryLessBody']")
            ParseLinksFromBlock(additional_node)
        except Exception:
            print("Нету таблички с материалами для обработки, идем дальше...")

        # Third level: the parent element of every 'categoryHead' div.
        third_nodes = second_node.find_elements_by_xpath(".//div[contains(@class,'categoryHead')]/..")
        for third_node in third_nodes:
            try:
                third_node.click()
                time.sleep(3)
            except Exception:
                print("Ошибка при попытке раскрыть узел " + third_node.text)
                break
            ParseLinksFromBlock(third_node)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.