-
-
Save mr-linch/ea6803f8df5d3805464a to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import csv
import urllib.request
from bs4 import BeautifulSoup
BASE_URL = 'http://www.weblancer.net/projects/'
def get_html(url):
    """Download *url* and return the raw response body as bytes.

    Uses the response as a context manager so the underlying socket
    is closed even if ``read()`` raises (the original leaked it).
    """
    with urllib.request.urlopen(url) as response:
        return response.read()
def get_page_count(html):
    """Return the total number of listing pages found in *html*.

    The next-to-last link in the pagination block holds the last
    page number.

    Raises:
        ValueError: if the pagination block is absent (the site's
            markup changed) — previously this died with a bare
            ``AttributeError: 'NoneType' object has no attribute ...``.
    """
    # Explicit parser: silences bs4's "no parser was explicitly
    # specified" warning and pins behaviour across environments.
    soup = BeautifulSoup(html, 'html.parser')
    pagination = soup.find('div', class_='pages_list text_box')
    if pagination is None:
        raise ValueError('pagination block not found — has the page layout changed?')
    return int(pagination.find_all('a')[-2].text)
def parse(html):
    """Extract projects from one listing page.

    Returns a list of dicts with ``title``, ``categories`` (list of
    str), ``price`` and ``application`` keys.
    """
    # Explicit parser avoids bs4's "no parser specified" warning.
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table', class_='items_list')
    rows = table.find_all('tr')[1:]  # skip the header row
    projects = []
    for row in rows:
        cols = row.find_all('td')
        projects.append({
            'title': cols[0].a.text,
            # Category links are wrapped in <noindex> tags on this site.
            'categories': [category.text for category in cols[0].find_all('noindex')],
            'price': cols[1].text.strip().split()[0],
            'application': cols[2].text.split()[0]
        })
    return projects
def save(projects, path):
    """Write parsed projects to a CSV file at *path*.

    Each project dict must provide ``title``, ``categories`` (list of
    str), ``price`` and ``application``.
    """
    # newline='' stops the csv module from emitting blank rows on
    # Windows; utf-8 keeps the Cyrillic header readable regardless of
    # the platform's locale encoding.
    with open(path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(('Проект', 'Категории', 'Цена', 'Заявки'))
        writer.writerows(
            (project['title'],
             ', '.join(project['categories']),
             project['price'],
             project['application'])
            for project in projects
        )
def main():
    """Crawl every listing page and dump all projects to projects.csv."""
    total_pages = get_page_count(get_html(BASE_URL))
    print('Всего найдено %d страниц...' % total_pages)
    projects = []
    for page in range(1, total_pages + 1):
        percent = page / total_pages * 100
        print('Парсинг %d%% (%d/%d)' % (percent, page, total_pages))
        page_url = BASE_URL + "page=%d" % page
        projects.extend(parse(get_html(page_url)))
    print('Сохранение...')
    save(projects, 'projects.csv')


if __name__ == '__main__':
    main()
Подскажите, пожалуйста, в чем причина ошибки
weblancer.py:18: UserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system ("html.parser"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.
The code that caused this warning is on line 18 of the file weblancer.py. To get rid of this warning, pass the additional argument 'features="html.parser"' to the BeautifulSoup constructor.
soup = BeautifulSoup(html)
Traceback (most recent call last):
File "weblancer.py", line 68, in
main()
File "weblancer.py", line 53, in main
total_pages = get_page_count(get_html(BASE_URL))
File "weblancer.py", line 20, in get_page_count
return int(paggination.find_all('a')[-2].text)
AttributeError: 'NoneType' object has no attribute 'find_all'
@llirikkcoder, если еще актуально, то ошибка вываливается из-за того, что сайт перешел на другую структуру html-кода. Этот парсер пытается найти блоки с несуществующими классами и т.д.
Ниже мой вариант рабочего парсера:
#!/usr/bin/env python3
import csv
import requests
from bs4 import BeautifulSoup
BASE_URL = 'https://www.weblancer.net/jobs/'
def get_html(url):
    """Fetch *url* and return the raw response body (bytes).

    Aborts the program on any non-200 response, keeping the original
    "print an error and stop" behaviour but with a proper non-zero
    exit status.
    """
    response = requests.get(url)
    if response.status_code == 200:
        return response.content
    # quit() is an interactive-interpreter helper that exits with
    # status 0; SystemExit("Error") prints the message to stderr and
    # sets a non-zero exit code instead.
    raise SystemExit("Error")
def get_page_count(html):
    """Return the number of job-listing pages advertised in the pagination."""
    soup = BeautifulSoup(html, features="lxml")
    pagination = soup.find('div', class_='pagination_box')
    last_link = pagination.find_all('a')[-1]
    # The last link's href ends in '=NN'; the number after '=' is the
    # final page index.
    return int(last_link['href'].split("=")[1])
def parse(html):
    """Extract all projects from one listing page.

    Returns a list of dicts with ``title``, ``short_description``,
    ``categories`` (list of str), ``price`` and ``application`` keys.
    """
    soup = BeautifulSoup(html, features="lxml")
    table = soup.find('div', class_='page_content').find('div', class_='cols_table')
    rows = table.find_all('div', class_='row')
    projects = []
    # NOTE: removed the unused counter `nmbr = 1` from the original.
    for row in rows:
        # Move the currency sign from the front of the string to the end.
        price = row.find('div', class_='float-right float-sm-none title amount indent-xs-b0').text
        if price != '':
            price = price[1:] + price[0]
        projects.append({
            'title': row.find('h2').text,
            'short_description': row.find('p').text,
            'categories': [category.text for category in row.find('div', class_='col-sm-8 text-muted dot_divided').find_all('a')],
            'price': price,
            'application': row.find('div', class_='float-left float-sm-none text_field').text.strip()
        })
    return projects
def save(projects, path):
    """Write parsed projects to the CSV file at *path*."""
    # newline='' avoids blank rows on Windows; utf-8 keeps the
    # Cyrillic header intact on any locale.
    with open(path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(('Проект', 'Краткое описание', 'Категории', 'Цена', 'Заявки'))
        writer.writerows(
            (project['title'],
             project['short_description'],
             # Join the category list explicitly: writing the raw list
             # dumps its Python repr ("['a', 'b']") into a single cell.
             ', '.join(project['categories']),
             project['price'],
             project['application'])
            for project in projects
        )
def main():
    """Entry point: walk every page, collect the projects, save a CSV."""
    total_pages = get_page_count(get_html(BASE_URL))
    print('Всего найдено %d страниц...' % total_pages)
    projects = []
    for page in range(1, total_pages + 1):
        progress = page / total_pages * 100
        print('Парсинг %d%% (%d/%d)' % (progress, page, total_pages))
        page_html = get_html(BASE_URL + "?page=%d" % page)
        projects.extend(parse(page_html))
    print('Сохранение...')
    save(projects, 'projects.csv')
    print('Готово!')


if __name__ == '__main__':
    main()
Не могу разобраться с записью в файл, делал для другого сайта правда, но не суть. У меня получилось 3 категории, и при записи в csv, не идет разбиение на столбцы, все категории записываются без соблюдения полей, просто через запятую. Подскажите пожалуйста, в чем может быть ошибка?
def save(self, path):
projects = self.parsing()
with open(path, 'w') as csvfile:
writer = csv.writer(csvfile)
writer.writerow(('Порода', 'Цена', 'Время'))