# crawler.py (forked from Azazeo/crawler.py by @sposmen)
import re
import sys
import time
import threading
import urllib2
import urlparse

import psycopg2
from bs4 import BeautifulSoup

MAX_THREADS = 5
delay = 0.5  # seconds each worker sleeps between requests

start_url = 'http://airbnb.com'
# Match canonical room URLs, e.g. http://airbnb.com/rooms/12345
room_regex = re.compile(r'^http://airbnb\.com/rooms/[0-9]*$')

connection = psycopg2.connect(database="urls",
                              host="100.86.226.62",
                              port="5432",
                              user="postgres",
                              password="pgpassword")
cursor = connection.cursor()
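
# The statements below assume a table along these lines. The DDL is not part
# of the gist, so this is inferred from the INSERT/UPDATE/SELECT queries (the
# uniqueness of url is a guess, but the crawler relies on duplicate INSERTs
# failing, which implies some unique constraint):
#
#   CREATE TABLE airbnb_urls (
#       url     text PRIMARY KEY,
#       crawled boolean NOT NULL DEFAULT FALSE,
#       is_room boolean NOT NULL DEFAULT FALSE
#   );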

class Worker(threading.Thread):
    def __init__(self, n, Q):
        threading.Thread.__init__(self)
        self.n = n
        self.Q = Q            # this worker's private batch of URLs to crawl
        self.visited = set()
        # Each worker gets its own connection and cursor, so a commit or
        # rollback in one thread cannot interfere with another.
        self.connection = psycopg2.connect(database="urls",
                                           host="100.86.226.62",
                                           port="5432",
                                           user="postgres",
                                           password="pgpassword")
        self.cursor = self.connection.cursor()
    def process_url(self, url):
        self.visited.add(url)
        try:
            soup = BeautifulSoup(urllib2.urlopen(urllib2.Request(url)))
            page_urls = soup.findAll('a', href=True)
            try:
                self.cursor.execute("UPDATE airbnb_urls SET crawled = %s WHERE url = %s;", (True, url))
                self.connection.commit()
            except psycopg2.Error:
                # A failed statement aborts the transaction; roll back so the
                # connection is usable again (the original committed here).
                self.connection.rollback()
            for tag in page_urls:
                new_url = urlparse.urljoin(url, tag['href'])
                if start_url in new_url and new_url not in self.visited and '?' not in new_url and '#' not in new_url:
                    try:
                        is_room = room_regex.match(new_url) is not None
                        self.cursor.execute("INSERT INTO airbnb_urls (url, crawled, is_room) VALUES (%s, %s, %s)", (new_url, False, is_room))
                        self.connection.commit()
                        self.visited.add(new_url)
                    except psycopg2.Error:
                        # Most likely a duplicate url; roll back and move on.
                        self.connection.rollback()
        except urllib2.HTTPError as e:
            print "something wrong with {}".format(url)
            print "{} : {}".format(e.code, e.reason)
    def run(self):
        u = self.Q.pop()
        if u not in self.visited:
            self.visited.add(u)
            self.process_url(u)
        while len(self.Q) > 0:
            time.sleep(delay)
            u = self.Q.pop()
            print "Thread #", self.n
            print "Popped ", u
            print "In queue: ", len(self.Q)
            if u not in self.visited:
                self.visited.add(u)
                self.process_url(u)
        print "Queue is empty\nFINISHED"

while True:
    cursor.execute("SELECT COUNT (url) FROM airbnb_urls WHERE crawled = FALSE;")
    if cursor.fetchone()[0] == 0:
        print "Nothing to crawl"
        break
    JOBS = []
    for i in xrange(MAX_THREADS):
        # Hand each worker its own slice of up to 1000 uncrawled URLs.
        cursor.execute("SELECT url FROM airbnb_urls WHERE crawled = FALSE LIMIT 1000 OFFSET {};".format(i * 1000))
        Q = [row[0] for row in cursor.fetchall()]
        if len(Q) > 0:
            j = Worker(i, Q)
            JOBS.append(j)
            j.start()
    for j in JOBS:
        j.join()
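
# Note: the loop above exits immediately when the airbnb_urls table holds no
# uncrawled rows, so the table has to be seeded once before the first run.
# A minimal seed, assuming the schema sketched above:
#
#   cursor.execute("INSERT INTO airbnb_urls (url, crawled, is_room) VALUES (%s, %s, %s)",
#                  (start_url, False, False))
#   connection.commit()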

# --- parser (second script in this gist): fetches individual room pages ---

import random
import sys
import time
import threading
import urllib2
import urlparse
from datetime import datetime

import psycopg2
from bs4 import BeautifulSoup

# Worker.run below expects DELAY, which the original snippet never defined;
# 0.5 mirrors the crawler's `delay` and is an assumption.
DELAY = 0.5

connection = psycopg2.connect(database="urls",
                              host="100.86.226.62",
                              port="5432",
                              user="postgres",
                              password="pgpassword")
cursor = connection.cursor()
# Smoke test: grab one known room URL (table name fixed from 'airbnb_url',
# which does not match the 'airbnb_urls' table the crawler writes to).
cursor.execute('SELECT url FROM airbnb_urls WHERE is_room = TRUE LIMIT 1 OFFSET 1234;')
room = cursor.fetchone()
print room

# Each pattern maps an output field name to the id of the HTML element
# holding that value on the room page.
price_pattern = {
    'day_l': 'price_amount',
    'day_h': 'price_amount',
    'week_l': 'weekly_price_string',
    'week_h': 'weekly_price_string',
    'month_l': 'monthly_price_string',
    'month_h': 'monthly_price_string',
}
address_pattern = {
    'address': 'display-address',
    'zip-code': None,
}

def get_data_by_pattern(pattern, html):
    data = {}
    for name, element_id in pattern.iteritems():
        try:
            data[name] = html.find(id=element_id).string
        except AttributeError:
            # Element not found: html.find returned None.
            data[name] = None
    return data
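
# For example, on a page where only the nightly price element exists, the
# call below would return something shaped like
# {'day_l': u'$120', 'day_h': u'$120', 'week_l': None, ...}
# (the values are illustrative, not taken from a real page):
#
#   prices = get_data_by_pattern(price_pattern, soup)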

def get_address_from_airbnb(html):
    address = {
        'address': None,
        'zip-code': None,
    }
    try:
        a = html.find(id='display-address')['data-location']
        # Expects a trailing "<ZIP>, United States"; the fixed offsets below
        # slice the 5-digit ZIP out of that suffix.
        if a[-13:] == 'United States':
            address['zip-code'] = a[-20:-15]
        address['address'] = a
    except (TypeError, KeyError):
        pass
    return address

def get_rating_from_airbnb(html):
    rating = {'reviews': None, 'rating': None}
    try:
        rating['reviews'] = int(html.find(id='action-buttons').find('a', class_='icon').string)
    except (AttributeError, TypeError, ValueError):
        pass
    try:
        if rating['reviews'] > 0:
            stars = html.find('div', class_='star-rating')
            # Full stars count 1, half stars count 0.5.
            rating['rating'] = (len(stars.find_all(class_='icon icon-pink icon-star')) +
                                0.5 * len(stars.find_all(class_='icon icon-pink icon-star-half')))
    except AttributeError:
        pass
    return rating

def _get_description_detail(html, label):
    # Shared lookup for the description table; the three getters below were
    # identical except for the label they matched.
    r = None
    try:
        for td in html.find(id='description_details').find_all('td'):
            if td.string == label:
                r = int(td.parent.find_all('td')[1].string)
    except (AttributeError, TypeError, ValueError):
        pass
    return r

def get_accommodates_from_airbnb(html):
    return _get_description_detail(html, 'Accommodates:')

def get_bedrooms_from_airbnb(html):
    return _get_description_detail(html, 'Bedrooms:')

def get_bathrooms_from_airbnb(html):
    return _get_description_detail(html, 'Bathrooms:')
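
# parse_room below calls get_rooms_from_airbnb, which the original gist never
# defines. A minimal sketch, assuming it was meant to bundle the three
# description details above into one dict (the key names are guesses):
def get_rooms_from_airbnb(html):
    return {
        'accommodates': get_accommodates_from_airbnb(html),
        'bedrooms': get_bedrooms_from_airbnb(html),
        'bathrooms': get_bathrooms_from_airbnb(html),
    }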

def get_or_create_ppage(room):
    # ParsedPage is a MongoEngine-style document model; see the sketch below.
    ppage = ParsedPage.objects(url=room).first()
    if ppage:
        ppage.updated = datetime.now()
        return ppage
    else:
        ppage = ParsedPage(url=room, updated=datetime.now())
        return ppage
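
# The gist never defines ParsedPage, but the .objects(...).first() and .save()
# calls match MongoEngine's Document API. A minimal sketch, with the database
# name and field types guessed from what parse_room assigns:
from mongoengine import Document, StringField, DateTimeField, DictField, connect

connect('airbnb')  # database name is an assumption

class ParsedPage(Document):
    url = StringField(required=True, unique=True)
    updated = DateTimeField()
    rating = DictField()
    price = DictField()
    address = DictField()
    rooms = DictField()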

def parse_room(room):
    print room
    ppage = get_or_create_ppage(room)
    try:
        soup = BeautifulSoup(urllib2.urlopen(urllib2.Request(room)))
        ppage.rating = get_rating_from_airbnb(soup)
        ppage.price = get_data_by_pattern(price_pattern, soup)
        ppage.address = get_address_from_airbnb(soup)
        ppage.rooms = get_rooms_from_airbnb(soup)
    except urllib2.HTTPError as e:
        print "something wrong with {}".format(room)
        print "{} : {}".format(e.code, e.reason)
    except:
        print 'Some parsing error: {}'.format(sys.exc_info()[0])
    return ppage

def save_parsed_room(ppage):
    try:
        ppage.updated = datetime.now()
        ppage.save()
    except:
        print 'Error on saving {}'.format(ppage.url)
        print 'Details: {}'.format(sys.exc_info()[0])

class Worker(threading.Thread):
    def __init__(self, n):
        threading.Thread.__init__(self)
        self.n = n

    def run(self):
        global urls
        while len(urls) > 0:
            try:
                url = urls.pop()
            except IndexError:
                # Another thread emptied the list between the check and pop.
                break
            if self.n == 0:
                # Only thread 0 reports progress, to keep the output readable.
                print '#####'
                print 'urls left: {}'.format(len(urls))
                print '#####'
            save_parsed_room(parse_room(url))
            time.sleep(DELAY)
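
# The gist ends without ever filling `urls` or starting the workers. A
# minimal driver, assuming the same pattern as the crawler script above
# (MAX_THREADS is not defined in this second script, so it is set here):
MAX_THREADS = 5
cursor.execute('SELECT url FROM airbnb_urls WHERE is_room = TRUE;')
urls = [row[0] for row in cursor.fetchall()]
workers = [Worker(i) for i in xrange(MAX_THREADS)]
for w in workers:
    w.start()
for w in workers:
    w.join()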