aron-bordin/dmoz.py

## dmoz.py
#! /usr/bin/env python
from utils import *


# Number of "category" requests that were sent and whose responses have not
# been handled yet: incremented in response_parse, decremented in
# response_category.
pending_requests = 0
# Extracted data: maps each site title to its url (filled by response_category).
result = {}


def response_parse(response):
    """Send one "category" selector request per link found in ``response``.

    ``response`` is the message answering the "parse" request. Each entry of
    ``response['selector']['url']`` is appended to ``http://www.dmoz.org`` and
    requested with css selectors for the listed sites' urls and titles.

    Every request sent increments the module-level ``pending_requests``
    counter, which response_category uses to detect the last response.
    """
    global pending_requests
    # Imported locally so the function does not depend on ``json`` being
    # re-exported by the star import at the top of the file.
    import json

    for url in response['selector']['url']:
        # Build the message as a dict and let json.dumps escape it: pasting
        # the url into a JSON template produced invalid JSON whenever the
        # link contained a double quote or a backslash.
        request = {
            "type": "selector_request",
            "id": "category",
            "url": "http://www.dmoz.org%s" % url,
            "selector": {
                "url": {"type": "css", "filter": "div.title-and-desc a::attr('href')"},
                "title": {"type": "css", "filter": "div.title-and-desc a > div.site-title::text"},
            },
        }
        # we count the number of requests using this var
        pending_requests += 1
        # open a new request
        write_line(json.dumps(request))


def response_category(response):
    """Record the sites of one category page; close the spider after the last.

    ``response['selector']['url']`` and ``response['selector']['title']`` are
    paired positionally (extra items of the longer list are ignored) and
    stored in the module-level ``result`` dict as ``title -> url``.

    When the module-level ``pending_requests`` counter reaches zero, ``result``
    is written as JSON to ``outputs/dmoz_data.json`` and a "close" message is
    sent.
    """
    global pending_requests
    # this response is no longer pending
    pending_requests -= 1

    selector = response['selector']
    for url, title in zip(selector['url'], selector['title']):
        result[title] = url

    # if finished all requests, we can close the spider
    if pending_requests == 0:
        # serialize the extracted data; the with-block closes (and flushes)
        # the file before the close message is sent
        with open('outputs/dmoz_data.json', 'w') as output:
            output.write(json.dumps(result))
        write_line('{"type": "close"}')


def main():
    """Run the dmoz spider over the line-based JSON channel.

    Reads the initial status message from ``stdin``, declares the spider,
    requests the Python category page and then dispatches every incoming
    'response_selector' message to the handler registered for its ``id``.
    Messages of other types, or with an unknown id, are ignored. Returns when
    the input stream is exhausted.

    Raises:
        Exception: if the first message does not report status 'ready', or
            when an 'exception' or 'error' message is received.
    """
    status = parse_json(stdin.readline())

    # we start checking if the channel is ready
    if status['status'] != 'ready':
        raise Exception("There is problem in the communication channel")

    write_line('''
        {
            "type": "spider",
            "name": "dmoz",
            "start_urls": []
        }
    ''')

    write_line('''
        {
            "type": "selector_request",
            "id": "parse",
            "url": "http://www.dmoz.org/Computers/Programming/Languages/Python/",
            "selector": {
                "url": {"type": "css", "filter": "#subcategories-div > section > div > div.cat-item > a::attr('href')"}
            }
        }''')

    # response id -> function that extracts the data from that kind of page
    handlers = {
        'parse': response_parse,
        'category': response_category,
    }

    while True:
        line = stdin.readline()
        if not line:
            # readline() gives an empty string at end of input (assuming
            # ``stdin`` behaves like sys.stdin -- it is provided by utils).
            # Stop here instead of handing '' to parse_json.
            break

        msg = parse_json(line)
        msg_type = msg['type']

        # check the message type
        if msg_type in ('exception', 'error'):
            raise Exception("Something wrong... " + str(msg))

        if msg_type == 'response_selector':
            handler = handlers.get(msg['id'])
            if handler is not None:
                handler(msg)


# Start the spider only when this file is executed as a script.
if __name__ == '__main__':
    main()
	#! /usr/bin/env python
	from utils import *


	# Number of "category" requests that were sent and whose responses have not
	# been handled yet: incremented in response_parse, decremented in
	# response_category.
	pending_requests = 0
	# Extracted data: maps each site title to its url (filled by response_category).
	result = {}


	def response_parse(response):
	    """Send one "category" selector request per link found in ``response``.

	    ``response`` is the message answering the "parse" request. Each entry of
	    ``response['selector']['url']`` is appended to ``http://www.dmoz.org`` and
	    requested with css selectors for the listed sites' urls and titles.

	    Every request sent increments the module-level ``pending_requests``
	    counter, which response_category uses to detect the last response.
	    """
	    global pending_requests
	    # Imported locally so the function does not depend on ``json`` being
	    # re-exported by the star import.
	    import json

	    for url in response['selector']['url']:
	        # Build the message as a dict and let json.dumps escape it: pasting
	        # the url into a JSON template produces invalid JSON whenever the
	        # link contains a double quote or a backslash.
	        request = {
	            "type": "selector_request",
	            "id": "category",
	            "url": "http://www.dmoz.org%s" % url,
	            "selector": {
	                "url": {"type": "css", "filter": "div.title-and-desc a::attr('href')"},
	                "title": {"type": "css", "filter": "div.title-and-desc a > div.site-title::text"},
	            },
	        }
	        # we count the number of requests using this var
	        pending_requests += 1
	        # open a new request
	        write_line(json.dumps(request))


	def response_category(response):
	    """Record the sites of one category page; close the spider after the last.

	    ``response['selector']['url']`` and ``response['selector']['title']`` are
	    paired positionally (extra items of the longer list are ignored) and
	    stored in the module-level ``result`` dict as ``title -> url``.

	    When the module-level ``pending_requests`` counter reaches zero, ``result``
	    is written as JSON to ``outputs/dmoz_data.json`` and a "close" message is
	    sent.
	    """
	    global pending_requests
	    # this response is no longer pending
	    pending_requests -= 1

	    selector = response['selector']
	    for url, title in zip(selector['url'], selector['title']):
	        result[title] = url

	    # if finished all requests, we can close the spider
	    if pending_requests == 0:
	        # serialize the extracted data; the with-block closes (and flushes)
	        # the file before the close message is sent
	        with open('outputs/dmoz_data.json', 'w') as output:
	            output.write(json.dumps(result))
	        write_line('{"type": "close"}')


	def main():
	    """Run the dmoz spider over the line-based JSON channel.

	    Reads the initial status message from ``stdin``, declares the spider,
	    requests the Python category page and then dispatches every incoming
	    'response_selector' message to the handler registered for its ``id``.
	    Messages of other types, or with an unknown id, are ignored. Returns when
	    the input stream is exhausted.

	    Raises:
	        Exception: if the first message does not report status 'ready', or
	            when an 'exception' or 'error' message is received.
	    """
	    status = parse_json(stdin.readline())

	    # we start checking if the channel is ready
	    if status['status'] != 'ready':
	        raise Exception("There is problem in the communication channel")

	    write_line('''
	        {
	            "type": "spider",
	            "name": "dmoz",
	            "start_urls": []
	        }
	    ''')

	    write_line('''
	        {
	            "type": "selector_request",
	            "id": "parse",
	            "url": "http://www.dmoz.org/Computers/Programming/Languages/Python/",
	            "selector": {
	                "url": {"type": "css", "filter": "#subcategories-div > section > div > div.cat-item > a::attr('href')"}
	            }
	        }''')

	    # response id -> function that extracts the data from that kind of page
	    handlers = {
	        'parse': response_parse,
	        'category': response_category,
	    }

	    while True:
	        line = stdin.readline()
	        if not line:
	            # readline() gives an empty string at end of input (assuming
	            # ``stdin`` behaves like sys.stdin -- it is provided by utils).
	            # Stop here instead of handing '' to parse_json.
	            break

	        msg = parse_json(line)
	        msg_type = msg['type']

	        # check the message type
	        if msg_type in ('exception', 'error'):
	            raise Exception("Something wrong... " + str(msg))

	        if msg_type == 'response_selector':
	            handler = handlers.get(msg['id'])
	            if handler is not None:
	                handler(msg)


	# Start the spider only when this file is executed as a script.
	if __name__ == '__main__':
	    main()