Instantly share code, notes, and snippets.

What would you like to do?
Scrape a few pages on Gumtree to find a place that matches your criteria
import asyncio
import bs4
import requests
import re
import tqdm
from csv import writer
from datetime import datetime
from urllib import parse
# Listings are kept only when their displayed date falls inside this
# window (both bounds inclusive).
FROM_DATE = datetime(2014, 9, 25)
TO_DATE = datetime(2014, 10, 5)

# NOTE(review): write_filtered_flats also uses BASE_URL, a format string with
# two positional fields (page number, URL-encoded query string). Its
# definition is not visible in this file and must be restored.

# Search criteria, URL-encoded into the query string of every results page.
QUERY_PARAMS = {
    'min_property_number_beds': 2,
    'max_property_number_beds': 2,
    'min_price': 280,
    'max_price': 340,
    'photos_filter': 'Y',
    'search_location': 'London',
    'category': 'flats-and-houses-for-rent-offered',
}

# Number of result pages to scan (pages 1..DEPTH).
DEPTH = 150

# Shared semaphore acquired around each results-page request.
sem = asyncio.Semaphore(5)
class CSV:
    """Thin wrapper around ``csv.writer`` that emits one row per listing.

    Rows are written with ``;`` as the field delimiter.
    """

    def __init__(self, csvfile):
        """Bind the writer to ``csvfile``, an open text file-like object."""
        self.writer = writer(csvfile, delimiter=';')

    def write(self, date, pw, location, lat, lon, link, description):
        """Write one listing as a row.

        Row layout: date, monthly price, weekly price, location, lat, lon,
        link, description.

        ``pw`` is the raw price text; the first run of digits in it is taken
        as the weekly amount and converted to a monthly amount
        (``weekly * 52 / 12``, two decimals). When ``pw`` contains no digits
        both price columns are left empty instead of raising.

        Double quotes are stripped from ``location``.
        """
        # NOTE(review): the call on the price text was partly lost in this
        # copy of the file; ``re.search`` is the evident reconstruction.
        match = re.search(r'\d+', pw)
        if match:
            pw = match.group(0)
            pcm = '%.2f' % (float(pw) * 52 / 12)
        else:
            # No numeric price: keep the row, leave the price columns blank.
            pw = ''
            pcm = ''
        location = location.replace('"', '')
        self.writer.writerow(
            [date, pcm, pw, location, lat, lon, link, description]
        )
def _get_lat_lon(url):
    """Fetch ``url`` and extract the coordinates embedded in the page.

    The page body is searched for a ``ltlng":"<lat>;<lon>`` fragment.

    Returns:
        ``(lat, lon)`` as the matched decimal strings when the fragment is
        found, otherwise ``(0, 0)`` so callers can test both for truthiness.
    """
    response = requests.get(url)
    # Decode leniently: only the ASCII coordinate fragment matters here.
    content = response.content.decode('utf-8', errors='replace')
    # Raw string, and the decimal point is escaped so it cannot match an
    # arbitrary character.
    ltlng = re.search(r'ltlng":"(-?\d+\.\d+);(-?\d+\.\d+)', content)
    if ltlng:
        # NOTE(review): the success branch was missing from this copy of the
        # file; returning the two captured groups is the evident intent.
        return ltlng.group(1), ltlng.group(2)
    return 0, 0
async def write_filtered_flats(page, csv):
    """Scan one search-results page and write the matching listings.

    Fetches results page ``page`` (built from ``BASE_URL`` and
    ``QUERY_PARAMS``), and for every ``li.offer-sale`` entry whose displayed
    date lies within ``FROM_DATE``..``TO_DATE`` (inclusive) looks up its
    coordinates. Listings with usable coordinates are passed to
    ``csv.write``.

    Args:
        page: 1-based results page number.
        csv: object exposing ``write(date, pw, location, lat, lon, link,
            description)``.

    NOTE(review): the ``.select(...)`` receivers and the ``csv.write`` call
    were partly lost in this copy of the file and are reconstructed from the
    surrounding code.
    """
    url = BASE_URL.format(page, parse.urlencode(QUERY_PARAMS))
    loop = asyncio.get_event_loop()

    # requests is blocking: run it in the default executor so the event loop
    # stays responsive and the semaphore really bounds concurrent requests.
    async with sem:
        response = await loop.run_in_executor(None, requests.get, url)

    soup = bs4.BeautifulSoup(response.content)
    for li in soup.select('li.offer-sale'):
        a = li.select('> a.description')[0]
        link = a.attrs['href']
        description = a.attrs['title']
        pw = li.select('span.price')[0].text
        location = li.select('span.location')[0].text
        raw_date = li.select(
            '> div.location-and-date .displayed-date'
        )[0].text
        date = datetime.strptime(raw_date, '%d/%m/%y')
        if not FROM_DATE <= date <= TO_DATE:
            continue
        # Get the proper location; listings without one are skipped.
        # The detail-page fetch shares the same request cap.
        async with sem:
            lat, lon = await loop.run_in_executor(None, _get_lat_lon, link)
        if lat and lon:
            csv.write(
                raw_date, pw, location, lat, lon, link, description
            )
async def wait_with_progress(coros):
    """Await every awaitable in ``coros`` while showing a tqdm progress bar.

    Args:
        coros: a sized collection of awaitables (``len()`` is used for the
            bar's total). Results are awaited in completion order and
            discarded; an exception from any of them propagates.
    """
    for f in tqdm.tqdm(asyncio.as_completed(coros), total=len(coros)):
        await f
if __name__ == '__main__':
    # Scan result pages 1..DEPTH concurrently and collect matches in a
    # semicolon-delimited CSV file.
    loop = asyncio.get_event_loop()
    with open('/tmp/flats.csv', 'w', newline='') as csvfile:
        csv = CSV(csvfile)
        # A list, not a generator: wait_with_progress calls len() on it.
        f = wait_with_progress(
            [
                write_filtered_flats(page, csv)
                for page in range(1, DEPTH + 1)
            ]
        )
        # Run inside the ``with`` block so the file stays open while rows
        # are being written.
        loop.run_until_complete(f)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment