Skip to content

Instantly share code, notes, and snippets.

@dmig
Created March 30, 2019 14:23
Show Gist options
  • Save dmig/8f673f92295d0ee94f5be320418a35c1 to your computer and use it in GitHub Desktop.
Save dmig/8f673f92295d0ee94f5be320418a35c1 to your computer and use it in GitHub Desktop.
Avito page parser demo
#! /usr/bin/env python3
import csv
import requests
from bs4 import BeautifulSoup
# Scrape the Avito "houses / dachas / cottages" listing for Pavlovskaya and
# export one CSV row per listing, following the "next page" link until the
# pagination runs out.

# First listing page; later pages are discovered from the pagination link.
START_URL = 'https://www.avito.ru/pavlovskaya/doma_dachi_kottedzhi'
# Seconds to wait for the server before giving up; without a timeout a
# stalled connection would block the script forever.
REQUEST_TIMEOUT = 30
# Column order of the exported CSV.
CSV_HEADER = ['name', 'date', 'address', 'price', 'currency', 'photo', 'url']


def _text(node):
    """Return the text of *node*, or '' when the element was not found."""
    return node.text if node is not None else ''


def _attr(node, name):
    """Return attribute *name* of *node*, or '' when element or attribute is missing."""
    if node is None:
        return ''
    return node.attrs.get(name) or ''


# newline='' is required by the csv module so that it controls line endings
# itself (otherwise rows get an extra blank line on Windows); the explicit
# encoding keeps Cyrillic text writable regardless of the system locale.
# The context managers guarantee the file and the HTTP session are closed
# even when a request or a parse step raises.
with open('output.csv', 'w', newline='', encoding='utf-8') as outfile:
    with requests.Session() as session:
        csvwriter = csv.writer(outfile)
        csvwriter.writerow(CSV_HEADER)

        p = 0
        next_page = START_URL
        while next_page:
            p += 1
            print('Requesting page', p)
            with session.get(next_page, timeout=REQUEST_TIMEOUT) as resp:
                # Fail loudly on 4xx/5xx instead of parsing an error page
                # as if it were an (empty) listing.
                resp.raise_for_status()
                soup = BeautifulSoup(resp.text, features="html5lib")

            for item in soup.select('div.item.item_table'):
                # Only the first photo of a listing is exported. The src is
                # prefixed with 'https:' as in the original script.
                photo_src = _attr(item.select_one('div.item-photo img'), 'src')
                photo = 'https:' + photo_src if photo_src else ''

                href = _attr(item.select_one('a[itemprop=url]'), 'href')
                url = 'https://avito.ru' + href if href else ''

                # Missing elements yield empty cells rather than aborting the
                # whole scrape on a single incomplete listing.
                csvwriter.writerow([
                    _text(item.select_one('span[itemprop=name]')),
                    _attr(item.select_one('div[data-absolute-date]'),
                          'data-absolute-date').strip(),
                    _text(item.select_one('p.address')).strip(),
                    _attr(item.select_one('span[itemprop=price]'), 'content'),
                    _attr(item.select_one('span[itemprop=priceCurrency]'), 'content'),
                    photo,
                    url,
                ])

            # No "next" link (or one without href) ends the loop.
            next_href = _attr(
                soup.select_one('a.pagination-page.js-pagination-next'), 'href')
            next_page = 'https://www.avito.ru' + next_href if next_href else None
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment