Skip to content

Instantly share code, notes, and snippets.

@tananin
Last active August 14, 2021 08:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tananin/f5e298e39af9d317c479a3e5f653e667 to your computer and use it in GitHub Desktop.
Save tananin/f5e298e39af9d317c479a3e5f653e667 to your computer and use it in GitHub Desktop.

Парсинг сайтов на Python

Библиотеки

Нам понадобятся 3 библиотеки: requests, beautifulsoup4 и lxml.

Виртуальное окружение

При создании нового проекта PyCharm сам создаёт и активирует виртуальное окружение venv.

Установка зависимостей

pip install requests beautifulsoup4 lxml
## Парсер wildberries
import bs4
import requests
import logging
import collections
import csv
# Logging: a named logger for the scraper, with the root logger
# configured to emit INFO and above.
logger = logging.getLogger('wb')
logging.basicConfig(level=logging.INFO)
# Record type for one parsed product card: brand, product name and link.
ParseResult = collections.namedtuple(
    'ParseResult',
    ['brand_name', 'goods_name', 'url'],
)
# Column titles written as the first row of the CSV export; the order
# matches the field order of ParseResult (brand, product, link).
HEADERS = ('Брэнд', 'Товар', 'Ссылка')
class Client:
    """Download one Wildberries catalog page, parse its cards, export to CSV.

    Typical use is ``Client().run()``, which performs the three steps in
    order: ``load_page`` -> ``parse_page`` -> ``save_result``.

    Attributes:
        session: ``requests.Session`` carrying the browser-like headers.
        result: list of ``ParseResult`` rows collected by ``parse_block``.
    """

    # Site root that is prepended to the href found in each product card.
    BASE_URL = 'https://www.wildberries.ru'
    # Catalog page that ``load_page`` downloads.
    CATALOG_URL = BASE_URL + '/catalog/muzhchinam/odezhda/bryuki-i-shorty'
    # Seconds to wait for the server; without a timeout requests may block
    # forever on an unresponsive connection.
    REQUEST_TIMEOUT = 30
    # Default CSV destination used when ``save_result`` gets no explicit path.
    OUTPUT_PATH = '/Users/maksimtananin/Python/Parser/parser-learn/test.csv'

    def __init__(self):
        """Create the HTTP session and an empty result list."""
        self.session = requests.Session()
        self.session.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36',
            'Accept-Language': 'ru',
        }
        self.result = []

    def load_page(self):
        """Fetch the catalog page and return its body as text.

        Raises:
            requests.HTTPError: if the response status is 4xx/5xx
                (via ``raise_for_status``).
            requests.Timeout: if the server does not answer within
                ``REQUEST_TIMEOUT`` seconds.
        """
        res = self.session.get(
            url=self.CATALOG_URL,
            timeout=self.REQUEST_TIMEOUT,
        )
        res.raise_for_status()
        return res.text

    def parse_page(self, text: str):
        """Parse the page markup and feed every product card to ``parse_block``."""
        soup = bs4.BeautifulSoup(text, 'lxml')
        for block in soup.select('div.dtList.i-dtList.j-card-item'):
            self.parse_block(block=block)

    def parse_block(self, block):
        """Extract link, name and brand from one card and store the row.

        A card that lacks any of the three pieces is logged as an error and
        skipped; nothing is appended to ``self.result`` for it.
        """
        # URL
        url_block = block.select_one('a.ref_goods_n_p')
        if not url_block:
            logger.error('no url block')
            return

        # Validate the attribute before concatenating: a missing href is
        # None, and 'str' + None would raise TypeError instead of logging.
        href = url_block.get('href')
        if not href:
            logger.error('No href in url')
            return
        url = self.BASE_URL + href

        # Name
        name_block = block.select_one('span.goods-name')
        if not name_block:
            logger.error('No name on %s', url)
            return
        name = name_block.text.strip()

        # Brand
        brand_block = block.select_one('strong.brand-name')
        if not brand_block:
            logger.error('No brand name on %s', url)
            return
        # Slashes are removed from the brand text before trimming whitespace.
        brand = brand_block.text.replace('/', '').strip()

        # Save data
        self.result.append(ParseResult(
            brand_name=brand,
            goods_name=name,
            url=url,
        ))

        logger.debug('%s, %s, %s', url, name, brand)
        logger.debug('=' * 100)

    def save_result(self, path=None):
        """Write the header row and all collected rows to a CSV file.

        Args:
            path: destination file; defaults to ``OUTPUT_PATH``.

        The file is written with ``;`` as delimiter and every field quoted.
        """
        if path is None:
            path = self.OUTPUT_PATH
        # newline='' lets the csv module control line endings itself;
        # an explicit encoding keeps the Cyrillic headers platform-independent.
        with open(path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f, quoting=csv.QUOTE_ALL, delimiter=';')
            writer.writerow(HEADERS)
            writer.writerows(self.result)

    def run(self):
        """Run the whole pipeline: download, parse, report count, save."""
        text = self.load_page()
        self.parse_page(text=text)
        logger.info('Получили %s элементов', len(self.result))
        self.save_result()
# Script entry point: build the client and run the full pipeline.
if __name__ == '__main__':
    Client().run()
## Парсер wildberries
import bs4
import requests
import logging
# Logging: a named logger for the scraper, with the root logger
# configured to emit DEBUG and above.
logger = logging.getLogger('wb')
logging.basicConfig(level=logging.DEBUG)
class Client:
    """Download one Wildberries catalog page and log every product card.

    Typical use is ``Client().run()``, which calls ``load_page`` and then
    ``parse_page``.

    Attributes:
        session: ``requests.Session`` carrying the browser-like headers.
    """

    # Seconds to wait for the server; without a timeout requests may block
    # forever on an unresponsive connection.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """Create the HTTP session with browser-like request headers."""
        self.session = requests.Session()
        self.session.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36',
            'Accept-Language': 'ru',
        }

    def load_page(self):
        """Fetch the catalog page and return its body as text.

        Raises:
            requests.HTTPError: if the response status is 4xx/5xx
                (via ``raise_for_status``).
            requests.Timeout: if the server does not answer within
                ``REQUEST_TIMEOUT`` seconds.
        """
        url = 'https://www.wildberries.ru/catalog/muzhchinam/odezhda/bryuki-i-shorty'
        res = self.session.get(url=url, timeout=self.REQUEST_TIMEOUT)
        res.raise_for_status()
        return res.text

    def parse_page(self, text: str):
        """Parse the page markup and pass every product card to ``parse_block``."""
        soup = bs4.BeautifulSoup(text, 'lxml')
        for block in soup.select('div.dtList.i-dtList.j-card-item'):
            self.parse_block(block=block)

    def parse_block(self, block):
        """Log the raw card markup followed by a separator line."""
        logger.info(block)
        logger.info('=' * 100)

    def run(self):
        """Download the page and process all of its cards."""
        text = self.load_page()
        self.parse_page(text=text)
# Script entry point: build the client and run it.
if __name__ == '__main__':
    Client().run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment