# crawler.py (forked from Azazeo/crawler.py by @sposmen)
import re
import sys
import time
import threading
import urllib2
import urlparse

import psycopg2
from bs4 import BeautifulSoup

MAX_THREADS = 5
delay = 0.5  # seconds each worker sleeps between requests

start_url = 'http://airbnb.com'
# Match canonical room URLs, e.g. http://airbnb.com/rooms/12345
room_regex = re.compile(r'^http://airbnb\.com/rooms/[0-9]*$')

connection = psycopg2.connect(database="urls",
                              host="100.86.226.62",
                              port="5432",
                              user="postgres",
                              password="pgpassword")
cursor = connection.cursor()
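
# The statements below assume a table along these lines. The DDL is not part
# of the gist, so this is inferred from the INSERT/UPDATE/SELECT queries (the
# uniqueness of url is a guess, but the crawler relies on duplicate INSERTs
# failing, which implies some unique constraint):
#
#   CREATE TABLE airbnb_urls (
#       url     text PRIMARY KEY,
#       crawled boolean NOT NULL DEFAULT FALSE,
#       is_room boolean NOT NULL DEFAULT FALSE
#   );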

class Worker(threading.Thread):
    def __init__(self, n, Q):
        threading.Thread.__init__(self)
        self.n = n
        self.Q = Q            # this worker's private batch of URLs to crawl
        self.visited = set()
        # Each worker gets its own connection and cursor, so a commit or
        # rollback in one thread cannot interfere with another.
        self.connection = psycopg2.connect(database="urls",
                                           host="100.86.226.62",
                                           port="5432",
                                           user="postgres",
                                           password="pgpassword")
        self.cursor = self.connection.cursor()
    def process_url(self, url):
        self.visited.add(url)
        try:
            soup = BeautifulSoup(urllib2.urlopen(urllib2.Request(url)))
            page_urls = soup.findAll('a', href=True)
            try:
                self.cursor.execute("UPDATE airbnb_urls SET crawled = %s WHERE url = %s;", (True, url))
                self.connection.commit()
            except psycopg2.Error:
                # A failed statement aborts the transaction; roll back so the
                # connection is usable again (the original committed here).
                self.connection.rollback()
            for tag in page_urls:
                new_url = urlparse.urljoin(url, tag['href'])
                if start_url in new_url and new_url not in self.visited and '?' not in new_url and '#' not in new_url:
                    try:
                        is_room = room_regex.match(new_url) is not None
                        self.cursor.execute("INSERT INTO airbnb_urls (url, crawled, is_room) VALUES (%s, %s, %s)", (new_url, False, is_room))
                        self.connection.commit()
                        self.visited.add(new_url)
                    except psycopg2.Error:
                        # Most likely a duplicate url; roll back and move on.
                        self.connection.rollback()
        except urllib2.HTTPError as e:
            print "something wrong with {}".format(url)
            print "{} : {}".format(e.code, e.reason)
    def run(self):
        u = self.Q.pop()
        if u not in self.visited:
            self.visited.add(u)
            self.process_url(u)
        while len(self.Q) > 0:
            time.sleep(delay)
            u = self.Q.pop()
            print "Thread #", self.n
            print "Popped ", u
            print "In queue: ", len(self.Q)
            if u not in self.visited:
                self.visited.add(u)
                self.process_url(u)
        print "Queue is empty\nFINISHED"

while True:
    cursor.execute("SELECT COUNT (url) FROM airbnb_urls WHERE crawled = FALSE;")
    if cursor.fetchone()[0] == 0:
        print "Nothing to crawl"
        break
    JOBS = []
    for i in xrange(MAX_THREADS):
        # Hand each worker its own slice of up to 1000 uncrawled URLs.
        cursor.execute("SELECT url FROM airbnb_urls WHERE crawled = FALSE LIMIT 1000 OFFSET {};".format(i * 1000))
        Q = [row[0] for row in cursor.fetchall()]
        if len(Q) > 0:
            j = Worker(i, Q)
            JOBS.append(j)
            j.start()
    for j in JOBS:
        j.join()
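
# Note: the loop above exits immediately when the airbnb_urls table holds no
# uncrawled rows, so the table has to be seeded once before the first run.
# A minimal seed, assuming the schema sketched above:
#
#   cursor.execute("INSERT INTO airbnb_urls (url, crawled, is_room) VALUES (%s, %s, %s)",
#                  (start_url, False, False))
#   connection.commit()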

# --- parser (second script in this gist): fetches individual room pages ---

import random
import sys
import time
import threading
import urllib2
import urlparse
from datetime import datetime

import psycopg2
from bs4 import BeautifulSoup

# Worker.run below expects DELAY, which the original snippet never defined;
# 0.5 mirrors the crawler's `delay` and is an assumption.
DELAY = 0.5

connection = psycopg2.connect(database="urls",
                              host="100.86.226.62",
                              port="5432",
                              user="postgres",
                              password="pgpassword")
cursor = connection.cursor()
# Smoke test: grab one known room URL (table name fixed from 'airbnb_url',
# which does not match the 'airbnb_urls' table the crawler writes to).
cursor.execute('SELECT url FROM airbnb_urls WHERE is_room = TRUE LIMIT 1 OFFSET 1234;')
room = cursor.fetchone()
print room

# Each pattern maps an output field name to the id of the HTML element
# holding that value on the room page.
price_pattern = {
    'day_l': 'price_amount',
    'day_h': 'price_amount',
    'week_l': 'weekly_price_string',
    'week_h': 'weekly_price_string',
    'month_l': 'monthly_price_string',
    'month_h': 'monthly_price_string',
}
address_pattern = {
    'address': 'display-address',
    'zip-code': None,
}

def get_data_by_pattern(pattern, html):
    data = {}
    for name, element_id in pattern.iteritems():
        try:
            data[name] = html.find(id=element_id).string
        except AttributeError:
            # Element not found: html.find returned None.
            data[name] = None
    return data
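
# For example, on a page where only the nightly price element exists, the
# call below would return something shaped like
# {'day_l': u'$120', 'day_h': u'$120', 'week_l': None, ...}
# (the values are illustrative, not taken from a real page):
#
#   prices = get_data_by_pattern(price_pattern, soup)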

def get_address_from_airbnb(html):
    address = {
        'address': None,
        'zip-code': None,
    }
    try:
        a = html.find(id='display-address')['data-location']
        # Expects a trailing "<ZIP>, United States"; the fixed offsets below
        # slice the 5-digit ZIP out of that suffix.
        if a[-13:] == 'United States':
            address['zip-code'] = a[-20:-15]
        address['address'] = a
    except (TypeError, KeyError):
        pass
    return address

def get_rating_from_airbnb(html):
    rating = {'reviews': None, 'rating': None}
    try:
        rating['reviews'] = int(html.find(id='action-buttons').find('a', class_='icon').string)
    except (AttributeError, TypeError, ValueError):
        pass
    try:
        if rating['reviews'] > 0:
            stars = html.find('div', class_='star-rating')
            # Full stars count 1, half stars count 0.5.
            rating['rating'] = (len(stars.find_all(class_='icon icon-pink icon-star')) +
                                0.5 * len(stars.find_all(class_='icon icon-pink icon-star-half')))
    except AttributeError:
        pass
    return rating

def _get_description_detail(html, label):
    # Shared lookup for the description table; the three getters below were
    # identical except for the label they matched.
    r = None
    try:
        for td in html.find(id='description_details').find_all('td'):
            if td.string == label:
                r = int(td.parent.find_all('td')[1].string)
    except (AttributeError, TypeError, ValueError):
        pass
    return r

def get_accommodates_from_airbnb(html):
    return _get_description_detail(html, 'Accommodates:')

def get_bedrooms_from_airbnb(html):
    return _get_description_detail(html, 'Bedrooms:')

def get_bathrooms_from_airbnb(html):
    return _get_description_detail(html, 'Bathrooms:')
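
# parse_room below calls get_rooms_from_airbnb, which the original gist never
# defines. A minimal sketch, assuming it was meant to bundle the three
# description details above into one dict (the key names are guesses):
def get_rooms_from_airbnb(html):
    return {
        'accommodates': get_accommodates_from_airbnb(html),
        'bedrooms': get_bedrooms_from_airbnb(html),
        'bathrooms': get_bathrooms_from_airbnb(html),
    }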

def get_or_create_ppage(room):
    # ParsedPage is a MongoEngine-style document model; see the sketch below.
    ppage = ParsedPage.objects(url=room).first()
    if ppage:
        ppage.updated = datetime.now()
        return ppage
    else:
        ppage = ParsedPage(url=room, updated=datetime.now())
        return ppage
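
# The gist never defines ParsedPage, but the .objects(...).first() and .save()
# calls match MongoEngine's Document API. A minimal sketch, with the database
# name and field types guessed from what parse_room assigns:
from mongoengine import Document, StringField, DateTimeField, DictField, connect

connect('airbnb')  # database name is an assumption

class ParsedPage(Document):
    url = StringField(required=True, unique=True)
    updated = DateTimeField()
    rating = DictField()
    price = DictField()
    address = DictField()
    rooms = DictField()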

def parse_room(room):
    print room
    ppage = get_or_create_ppage(room)
    try:
        soup = BeautifulSoup(urllib2.urlopen(urllib2.Request(room)))
        ppage.rating = get_rating_from_airbnb(soup)
        ppage.price = get_data_by_pattern(price_pattern, soup)
        ppage.address = get_address_from_airbnb(soup)
        ppage.rooms = get_rooms_from_airbnb(soup)
    except urllib2.HTTPError as e:
        print "something wrong with {}".format(room)
        print "{} : {}".format(e.code, e.reason)
    except:
        print 'Some parsing error: {}'.format(sys.exc_info()[0])
    return ppage

def save_parsed_room(ppage):
    try:
        ppage.updated = datetime.now()
        ppage.save()
    except:
        print 'Error on saving {}'.format(ppage.url)
        print 'Details: {}'.format(sys.exc_info()[0])

class Worker(threading.Thread):
    def __init__(self, n):
        threading.Thread.__init__(self)
        self.n = n

    def run(self):
        global urls
        while len(urls) > 0:
            try:
                url = urls.pop()
            except IndexError:
                # Another thread emptied the list between the check and pop.
                break
            if self.n == 0:
                # Only thread 0 reports progress, to keep the output readable.
                print '#####'
                print 'urls left: {}'.format(len(urls))
                print '#####'
            save_parsed_room(parse_room(url))
            time.sleep(DELAY)
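
# The gist ends without ever filling `urls` or starting the workers. A
# minimal driver, assuming the same pattern as the crawler script above
# (MAX_THREADS is not defined in this second script, so it is set here):
MAX_THREADS = 5
cursor.execute('SELECT url FROM airbnb_urls WHERE is_room = TRUE;')
urls = [row[0] for row in cursor.fetchall()]
workers = [Worker(i) for i in xrange(MAX_THREADS)]
for w in workers:
    w.start()
for w in workers:
    w.join()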