@Azazeo
Created January 13, 2014 09:49
Crawler and data extractor for the airbnb.com website. The gist has two parts: a multi-threaded link crawler that records room URLs in a PostgreSQL table, and a parser that extracts price, address, rating, and capacity details from each room page.
# --- Part 1: link crawler ---
MAX_THREADS = 5
delay = 0.5  # seconds between requests per thread

import psycopg2
import re
import sys
import time
import threading
import urllib2
import urlparse

from bs4 import BeautifulSoup

start_url = 'http://airbnb.com'
room_regex = re.compile(r'^http://airbnb\.com/rooms/[0-9]*$')

connection = psycopg2.connect(database="urls",
                              host="100.86.226.62",
                              port="5432",
                              user="postgres",
                              password="pgpassword")
cursor = connection.cursor()


class Worker(threading.Thread):
    """Crawls the URLs in its queue Q and records newly discovered links."""

    def __init__(self, n, Q):
        threading.Thread.__init__(self)
        self.n = n
        self.Q = Q
        self.visited = set()
        # Each thread gets its own connection; psycopg2 connections should
        # not be shared across threads without extra care.
        self.connection = psycopg2.connect(database="urls",
                                           host="100.86.226.62",
                                           port="5432",
                                           user="postgres",
                                           password="pgpassword")
        self.cursor = self.connection.cursor()

    def process_url(self, url):
        self.visited.add(url)
        try:
            soup = BeautifulSoup(urllib2.urlopen(urllib2.Request(url)))
            page_urls = soup.findAll('a', href=True)
            try:
                self.cursor.execute("UPDATE airbnb_urls SET crawled = %s WHERE url = %s;", (True, url))
                self.connection.commit()
            except psycopg2.Error:
                self.connection.rollback()
            for tag in page_urls:
                new_url = urlparse.urljoin(url, tag['href'])
                if start_url in new_url and new_url not in self.visited and '?' not in new_url and '#' not in new_url:
                    try:
                        is_room = room_regex.match(new_url) is not None
                        self.cursor.execute("INSERT INTO airbnb_urls (url, crawled, is_room) VALUES (%s, %s, %s)",
                                            (new_url, False, is_room))
                        self.connection.commit()
                        self.visited.add(new_url)
                    except psycopg2.Error:
                        # Typically a duplicate URL; roll back and keep going.
                        self.connection.rollback()
        except urllib2.HTTPError as e:
            print "something wrong with {}".format(url)
            print "{} : {}".format(e.code, e.reason)

    def run(self):
        while len(self.Q) > 0:
            u = self.Q.pop()
            print "Thread #", self.n
            print "Popped", u
            print "In queue:", len(self.Q)
            if u not in self.visited:
                self.visited.add(u)
                self.process_url(u)
            time.sleep(delay)
        print "Queue is empty\nFINISHED"


# Keep launching batches of workers until every stored URL has been crawled.
while True:
    cursor.execute("SELECT COUNT(url) FROM airbnb_urls WHERE crawled = FALSE;")
    if cursor.fetchone()[0] == 0:
        print "Nothing to crawl"
        break
    JOBS = []
    for i in xrange(MAX_THREADS):
        # Give each worker its own slice of up to 1000 uncrawled URLs.
        cursor.execute("SELECT url FROM airbnb_urls WHERE crawled = FALSE LIMIT 1000 OFFSET {};".format(i * 1000))
        Q = [row[0] for row in cursor.fetchall()]
        if len(Q) > 0:
            j = Worker(i, Q)
            JOBS.append(j)
            j.start()
    for j in JOBS:
        j.join()
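
Part 1 assumes the airbnb_urls table already exists and holds at least one uncrawled seed row; the script creates neither. A minimal one-time setup sketch (column types are inferred from the queries above; making url the primary key is what turns duplicate inserts into the harmless errors the workers roll back):

setup = psycopg2.connect(database="urls", host="100.86.226.62", port="5432",
                         user="postgres", password="pgpassword")
setup_cursor = setup.cursor()
setup_cursor.execute("""
    CREATE TABLE IF NOT EXISTS airbnb_urls (
        url     TEXT PRIMARY KEY,
        crawled BOOLEAN NOT NULL,
        is_room BOOLEAN NOT NULL
    );
""")
# Seed the frontier with the start page (first run only).
setup_cursor.execute("INSERT INTO airbnb_urls (url, crawled, is_room) VALUES (%s, %s, %s)",
                     (start_url, False, False))
setup.commit()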
# --- Part 2: room page parser ---
import psycopg2
import random
import sys
import time
import threading
import urllib2
import urlparse

from datetime import datetime
from bs4 import BeautifulSoup

connection = psycopg2.connect(database="urls",
                              host="100.86.226.62",
                              port="5432",
                              user="postgres",
                              password="pgpassword")
cursor = connection.cursor()

# Pull a single room URL to sanity-check the extractors.
cursor.execute('SELECT url FROM airbnb_urls WHERE is_room = TRUE LIMIT 1 OFFSET 1234;')
room = cursor.fetchone()
print room

# Field name -> id of the page element holding that value.
price_pattern = {
    'day_l': 'price_amount',
    'day_h': 'price_amount',
    'week_l': 'weekly_price_string',
    'week_h': 'weekly_price_string',
    'month_l': 'monthly_price_string',
    'month_h': 'monthly_price_string',
}

address_pattern = {
    'address': 'display-address',
    'zip-code': None,
}


def get_data_by_pattern(pattern, html):
    """For each (name, element id) pair, grab the element's string content."""
    data = {}
    for name, element_id in pattern.iteritems():
        try:
            data[name] = html.find(id=element_id).string
        except AttributeError:
            # Element not found on the page.
            data[name] = None
    return data
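
get_data_by_pattern drives the price extraction: it maps a field name to a DOM element id and records whatever string that element holds, or None when the element is missing. A quick illustration on inline HTML (the ids mirror price_pattern above):

demo = BeautifulSoup('<span id="price_amount">120</span>')
print get_data_by_pattern({'day_l': 'price_amount', 'week_l': 'weekly_price_string'}, demo)
# prints {'day_l': u'120', 'week_l': None} (key order may vary)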
def get_address_from_airbnb(html):
    address = {
        'address': None,
        'zip-code': None,
    }
    try:
        a = html.find(id='display-address')['data-location']
        # US locations end with "<ZIP> United States"; slice the ZIP out.
        if a[-13:] == 'United States':
            address['zip-code'] = a[-20:-15]
        address['address'] = a
    except (TypeError, KeyError):
        pass
    return address


def get_rating_from_airbnb(html):
    rating = {'reviews': None,
              'rating': None}
    try:
        rating['reviews'] = int(html.find(id='action-buttons').find('a', class_='icon').string)
    except (AttributeError, TypeError, ValueError):
        pass
    try:
        if rating['reviews'] > 0:
            # Full pink stars count 1, half stars 0.5.
            stars = html.find('div', class_='star-rating')
            rating['rating'] = (len(stars.find_all(class_='icon icon-pink icon-star'))
                                + 0.5 * len(stars.find_all(class_='icon icon-pink icon-star-half')))
    except AttributeError:
        pass
    return rating


def get_detail_from_airbnb(html, label):
    """Read one numeric row of the details table, e.g. label 'Bedrooms:'."""
    r = None
    try:
        for td in html.find(id='description_details').find_all('td'):
            if td.string == label:
                r = int(td.parent.find_all('td')[1].string)
    except (AttributeError, TypeError, ValueError):
        pass
    return r


def get_accommodates_from_airbnb(html):
    return get_detail_from_airbnb(html, 'Accommodates:')


def get_bedrooms_from_airbnb(html):
    return get_detail_from_airbnb(html, 'Bedrooms:')


def get_bathrooms_from_airbnb(html):
    return get_detail_from_airbnb(html, 'Bathrooms:')


def get_rooms_from_airbnb(html):
    # Not defined in the original gist; assumed here to bundle the three
    # detail fields that parse_room() stores under ppage.rooms.
    return {'accommodates': get_accommodates_from_airbnb(html),
            'bedrooms': get_bedrooms_from_airbnb(html),
            'bathrooms': get_bathrooms_from_airbnb(html)}


def get_or_create_ppage(room):
    # ParsedPage is a MongoEngine-style document class defined outside this gist.
    ppage = ParsedPage.objects(url=room).first()
    if ppage:
        ppage.updated = datetime.now()
        return ppage
    else:
        return ParsedPage(url=room, updated=datetime.now())


def parse_room(room):
    print room
    ppage = get_or_create_ppage(room)
    try:
        soup = BeautifulSoup(urllib2.urlopen(urllib2.Request(room)))
        ppage.rating = get_rating_from_airbnb(soup)
        ppage.price = get_data_by_pattern(price_pattern, soup)
        ppage.address = get_address_from_airbnb(soup)
        ppage.rooms = get_rooms_from_airbnb(soup)
    except urllib2.HTTPError as e:
        print "something wrong with {}".format(room)
        print "{} : {}".format(e.code, e.reason)
    except:
        print 'Some parsing error: {}'.format(sys.exc_info()[0])
    return ppage


def save_parsed_room(ppage):
    try:
        ppage.updated = datetime.now()
        ppage.save()
    except:
        print 'Error on saving {}'.format(ppage.url)
        print 'Details: {}'.format(sys.exc_info()[0])


class Worker(threading.Thread):
    def __init__(self, n):
        threading.Thread.__init__(self)
        self.n = n

    def run(self):
        global urls
        while len(urls) > 0:
            url = urls.pop()
            if self.n == 0:
                # Thread 0 doubles as the progress reporter.
                print '#####'
                print 'urls left: {}'.format(len(urls))
                print '#####'
            save_parsed_room(parse_room(url))
            time.sleep(DELAY)
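
As published, Part 2 never defines DELAY or fills the global urls list that Worker.run() drains, so the workers would have nothing to do. A minimal driver sketch, assuming the airbnb_urls table from Part 1 (the thread count and delay values are illustrative):

DELAY = 0.5
MAX_THREADS = 5

cursor.execute('SELECT url FROM airbnb_urls WHERE is_room = TRUE;')
urls = [row[0] for row in cursor.fetchall()]

workers = [Worker(i) for i in xrange(MAX_THREADS)]
for w in workers:
    w.start()
for w in workers:
    w.join()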
@mrd1no commented May 2, 2015

Hi Azazeo,
nice code!
Today I wrote an Airbnb crawler with Scrapy to build a rate-shopping tool that tracks how prices in my area develop over time.
As long as I was parsing just the general property pages, everything worked correctly.
But when I launched a search covering a couple of weeks, I started getting 503 Server Errors after a few seconds.
Do you think my IP has been blocked? Did you experience anything similar?

@magusd commented Oct 14, 2015

@mrd1no yeah, I'm having the same problem: it gives me a 503 after a few seconds.
I was using Python with Tornado and going through a proxy, but to no avail.
Have you managed to solve it?
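
A 503 after a burst of requests usually means server-side rate limiting rather than a hard IP ban. One common mitigation, not part of the gist, is to throttle requests and retry with exponential backoff; a minimal sketch in the same Python 2 style (the User-Agent value is only illustrative):

def fetch_with_backoff(url, max_tries=5, base_delay=2.0):
    # Sleep 2, 4, 8, ... seconds between attempts; anything other
    # than a 503 is re-raised immediately.
    for attempt in xrange(max_tries):
        try:
            request = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
            return urllib2.urlopen(request).read()
        except urllib2.HTTPError as e:
            if e.code != 503:
                raise
            time.sleep(base_delay * 2 ** attempt)
    raise urllib2.HTTPError(url, 503, 'still throttled after retries', None, None)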
