Skip to content

Instantly share code, notes, and snippets.

@aron-bordin
Created July 27, 2016 15:56
Show Gist options
  • Save aron-bordin/97ca4233b5a304cd1466c5322b358cc6 to your computer and use it in GitHub Desktop.
Save aron-bordin/97ca4233b5a304cd1466c5322b358cc6 to your computer and use it in GitHub Desktop.
#! /usr/bin/env python
from utils import *
# Number of "category" requests that have been sent and not yet answered;
# incremented in response_parse and decremented in response_category.
pending_requests = 0
# Extracted data, filled in by response_category: site title -> site url.
result = {}
def response_parse(response):
    """Send one "category" selector request per url found on the start page.

    ``response`` is the message received for the request with id "parse".
    Every entry of ``response['selector']['url']`` is appended to
    ``http://www.dmoz.org`` and requested with id "category", asking for the
    urls and titles of the sites listed on that page.  The module-level
    ``pending_requests`` counter grows by one for each request sent.
    """
    global pending_requests

    # Message template; the single %s placeholder receives the category path.
    # The text is kept flush-left so the emitted message stays unchanged.
    category_request = '''
{
"type": "selector_request",
"id": "category",
"url": "http://www.dmoz.org%s",
"selector": {
"url": {"type": "css", "filter": "div.title-and-desc a::attr('href')"},
"title": {"type": "css", "filter": "div.title-and-desc a > div.site-title::text"}
}
}
'''

    for category_path in response['selector']['url']:
        # track the outstanding request, then send it
        pending_requests = pending_requests + 1
        write_line(category_request % category_path)
def response_category(response):
    """Store the sites of one category page; close the spider after the last.

    ``response`` is the message received for a request with id "category".
    Its ``response['selector']['url']`` and ``response['selector']['title']``
    lists are paired positionally and stored in the module-level ``result``
    dict as ``title -> url`` (a repeated title overwrites the earlier entry).

    Once the module-level ``pending_requests`` counter drops to zero, the
    collected data is written as JSON to ``outputs/dmoz_data.json`` and a
    "close" message is sent.
    """
    global pending_requests

    # this response is no longer pending
    pending_requests -= 1

    selector = response['selector']
    # zip() stops at the shorter list, so an unmatched url or title is dropped
    for url, title in zip(selector['url'], selector['title']):
        result[title] = url

    # other category responses are still expected; nothing more to do yet
    if pending_requests != 0:
        return

    # All requests are answered: serialize the extracted data.  The ``with``
    # block guarantees the file is flushed and closed before the spider is
    # told to close, instead of relying on garbage collection.
    with open('outputs/dmoz_data.json', 'w') as output:
        output.write(json.dumps(result))
    write_line('{"type": "close"}')
def main():
    """Run the dmoz spider over the stdin/stdout message channel.

    Reads the first message and checks that its status is "ready", then
    sends the spider definition and the initial "parse" selector request.
    After that it reads incoming messages one per line and dispatches each
    "response_selector" message by its ``id`` to ``response_parse`` or
    ``response_category``.  Returns when the input stream reaches end of
    file.

    Raises:
        Exception: if the first message's status is not "ready", or if a
            message of type "exception" or "error" is received.
    """
    status = parse_json(stdin.readline())
    # we start checking if the channel is ready
    if status['status'] != 'ready':
        raise Exception("There is problem in the communication channel")

    # Declare the spider.  Message text is kept flush-left so the emitted
    # bytes stay unchanged.
    write_line('''
{
"type": "spider",
"name": "dmoz",
"start_urls": []
}
''')
    # Request the start page and extract the links to its subcategories.
    write_line('''
{
"type": "selector_request",
"id": "parse",
"url": "http://www.dmoz.org/Computers/Programming/Languages/Python/",
"selector": {
"url": {"type": "css", "filter": "#subcategories-div > section > div > div.cat-item > a::attr('href')"}
}
}''')

    while True:
        line = stdin.readline()
        if not line:
            # An empty string (as opposed to "\n") means end of input: the
            # other side closed the channel, so stop instead of handing an
            # empty line to parse_json.
            # NOTE(review): assumes ``stdin`` from utils behaves like
            # sys.stdin -- confirm.
            break

        msg = parse_json(line)
        # check the message type
        msg_type = msg['type']
        if msg_type in ('exception', 'error'):
            raise Exception("Something wrong... " + str(msg))
        if msg_type != 'response_selector':
            # any other message type is ignored
            continue

        # the id of the incoming response tells which page it belongs to,
        # and therefore which function extracts its data
        msg_id = msg['id']
        if msg_id == 'parse':
            response_parse(msg)
        elif msg_id == 'category':
            response_category(msg)
# Start the spider only when this file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment