Created
May 29, 2019 12:21
-
-
Save desulaid/1036fcfa9adece9a2c5c0e126f2fa34c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from requests import get | |
from lxml import html | |
from openpyxl.workbook import Workbook | |
from openpyxl.styles import Font | |
from re import findall | |
# Root of the site being scraped; listing and member URLs are built from it.
BASE_URL = 'https://kiportal.ru'

# First and last listing page to crawl; the page loop includes both ends.
PAGE_START = 20
PAGE_FINISH = 21

# Single-sheet workbook that collects one row per registry member.
wb = Workbook()
sheet = wb.active
sheet.title = 'Реестр членов СРО'

# Column captions for the export. Nothing in this script writes them to the
# sheet at the moment (see the placeholder loop below).
wb_headers = [
    'Фамилия', 'Имя', 'Отчество',
    '№ в реестре СРО',
    '№ Протокола приема', 'Дата протокола приема',
    '№ аттестата КИ', 'Дата выдачи аттестата',
    'Реестровый номер в ГРКИ',
    'Сведения об образовании',
    'СНИЛС', 'ИНН',
    'Почтовый адрес для связи с КИ',
    'Рабочий тел.', 'Мобильный тел.',
    'e-mail для связи',
]

# Next row to fill; member rows start at row 3, leaving rows 1-2 untouched.
user_row = 3

# Placeholder: visits one column letter code ('A', 'B', ...) per caption but
# does nothing with it, because writing the bold caption row is disabled.
for i in range(ord('A'), ord('A') + len(wb_headers)):
    pass
# Crawl every listing page in the configured range (both ends included) and
# append one spreadsheet row per member that is still in the registry.
# NOTE(review): the HTTP requests carry no timeout, so a stalled server blocks
# the script indefinitely; consider passing timeout= to get().
for page in range(PAGE_START, PAGE_FINISH + 1):
    url = f'{BASE_URL}/reestr/reestry-sro/reestr-chlenov/{page}.html'
    listing = html.fromstring(get(url).text)

    # The hrefs are joined directly onto BASE_URL below, so they are
    # presumably site-relative paths - verify against the live markup.
    links = listing.xpath('//tbody//tr//td//a//@href')

    for person in links:
        root = html.fromstring(get(f'{BASE_URL}{person}').text)

        # xpath() returns a list of text nodes, so the status has to be looked
        # up inside that list; comparing the list itself to a string is always
        # False and would never skip an excluded member.
        status = root.xpath('//div[@class="element element-text first"]//p//text()')
        if any(text.strip() == 'Исключен' for text in status):
            continue

        # The first table column holds the field labels. It is only used to
        # detect pages without a registry table: the result is a list (never
        # None), so emptiness is the meaningful test.
        header = [i[1:] for i in root.xpath('*//div[@id="tab-reestr"]//table//tr//td[1]//text()')]
        if not header:
            continue

        data = root.xpath('*//div[@id="tab-reestr"]//table//tr//td[2]//text()')

        # Every word of the page title becomes its own cell.
        # NOTE(review): assumes the title holds only surname, name and
        # patronymic; extra words would shift the columns that follow.
        name = findall(r'\w+', ''.join(root.xpath('//title//text()')))

        # Keep the name parts plus two slices of the value column.
        data = name + data[3:8] + data[10:11]

        # openpyxl columns are 1-based.
        for index, info in enumerate(data, start=1):
            sheet.cell(row=user_row, column=index, value=str(info))
        user_row += 1

# BASE_URL[8:] drops the 8-character 'https://' prefix, so the workbook is
# saved as ./kiportal.ru.xlsx in the current directory.
wb.save(f'./{BASE_URL[8:]}.xlsx')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment