A crawler for dribbble.com
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import requests
from bs4 import BeautifulSoup
import re
import os
import json
import logging
import traceback

logging.basicConfig(level=logging.DEBUG)

config = {'output-dir': './output', 'done-file': 'done.txt'}
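
# a single shared session, so cookies and keep-alive connections are reused
# across all the requests below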
bot = requests.Session()

def extract_links(sp_main):
    """Extract the links to individual shot pages

    :sp_main: BeautifulSoup-parsed result of a shots index page
    :returns: a list of links
    """
    dribbbles = sp_main.select('div.dribbble')
    return [x.find('a', 'dribbble-link')['href'] for x in dribbbles]

def extract_name(link):
    """Extract the shot name from a link

    :link: a link like '/parent/filename', or a list of such links
    :returns: the extracted name(s)
    """
    if isinstance(link, list):
        return [os.path.split(x)[-1] for x in link]
    elif isinstance(link, str):
        return os.path.split(link)[-1]
    else:
        raise TypeError('String or List type is required')
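
# illustrative examples (hypothetical shot names):
#   extract_name('/shots/1234567-cool-shot')          ->  '1234567-cool-shot'
#   extract_name(['/shots/a-shot', '/shots/b-shot'])  ->  ['a-shot', 'b-shot']
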
def get_image_page(file_name):
    """Retrieve the HTML content of an individual shot page

    :file_name: the shot name of an image
    :returns: the response of requests' get method
    """
    base_url = 'https://dribbble.com/shots/'
    return bot.get(url=base_url + file_name)
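
# e.g. get_image_page('1234567-cool-shot') requests
# https://dribbble.com/shots/1234567-cool-shot (again a hypothetical name)
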
def image_extract_title(soup):
    """Extract the title, author and date of an image

    :soup: BeautifulSoup-parsed shot page
    :returns: a dict with 'title', 'author' and 'time'
    """
    ret = {}
    title_section = soup.select('#screenshot-title-section')[0]
    ret['title'] = soup.select('#screenshot-title')[0].text
    ret['author'] = title_section.select('span.shot-byline-user a')[0].text
    ret['time'] = title_section.select('span.shot-byline-date a')[0].text
    return ret

def image_extract_description(soup):
    """Extract the description of the image

    :soup: BeautifulSoup-parsed shot page
    :returns: {'desc': ...} or {} when the shot has no description
    """
    desc = soup.select('div.shot-desc')
    return {} if len(desc) <= 0 else {'desc': desc[0].text}

def image_extract_tags(soup):
    """Extract the tags of an image

    :soup: BeautifulSoup-parsed shot page
    :returns: {'tags': [(id, name, url), ...]} or {} when there are no tags
    """
    ret = []
    # select() returns a list; the page has at most one '#tags' element
    tags = soup.select('#tags')
    if len(tags) < 1:
        return {}
    tags = tags[0]
    for t in tags.find_all('li'):
        tag_id = t['id']
        a = t.find('a')
        tag_name = a.text.strip()
        tag_url = extract_name(a['href'])
        ret.append((tag_id, tag_name, tag_url))
    return {'tags': ret}

def image_extract_meta(soup):
    """Extract the meta info of an image, such as likes and views

    :soup: BeautifulSoup-parsed shot page
    :returns: a dict with 'views' and 'likes' (as strings)
    """
    return {'views': re.findall(r'\d+', soup.select('.views-count')[0].text)[0],
            'likes': re.findall(r'\d+', soup.select('.likes-count')[0].text)[0]}

def image_extract_colors(soup):
    """Extract the colors of an image

    :soup: BeautifulSoup-parsed shot page
    :returns: {'colors': [...]} with one entry per color swatch
    """
    ret = []
    colors = soup.select('li.color')
    for c in colors:
        ret.append(c.text.strip())
    return {'colors': ret}

def image_extract_link(soup):
    """Extract the actual image link of an image

    :soup: BeautifulSoup-parsed shot page
    :returns: {'link': ...} pointing to the image source
    """
    return {'link': soup.select('div.single-img img')[0]['src']}

def image_extract_attach(soup):
    """Extract the links of attachments

    :soup: BeautifulSoup-parsed shot page
    :returns: {'attach': [...]} or {} when there are no attachments
    """
    base_url = 'https://dribbble.com'
    ret = []
    links = [x['href'] for x in soup.select('div.attachments a')]
    for link in links:
        try:
            # every attachment has its own viewer page; fetch it to find the
            # link to the full-size image
            sp_attach = BeautifulSoup(bot.get(url=base_url + link).content,
                                      'html.parser')
            for img in sp_attach.select('#viewer-img img'):
                ret.append(img['src'])
        except Exception:
            logging.debug('Error on fetching attachment: ' + link)
            traceback.print_exc()
    return {} if len(ret) <= 0 else {'attach': ret}

def extract_image_info(response):
    """Extract the interesting information from a shot page

    :response: the content of the response returned by get_image_page
    :returns: a dict of the extracted information
    """
    ret = {}
    soup = BeautifulSoup(response, 'html.parser')
    ret.update(image_extract_title(soup))
    ret.update(image_extract_description(soup))
    ret.update(image_extract_tags(soup))
    ret.update(image_extract_colors(soup))
    ret.update(image_extract_link(soup))
    ret.update(image_extract_meta(soup))
    ret.update(image_extract_attach(soup))
    return ret
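
# the result is a plain dict, roughly of this shape (values are illustrative;
# 'desc', 'tags' and 'attach' appear only when present on the page):
#   {'title': 'Some Shot', 'author': 'someone', 'time': 'January 17, 2016',
#    'tags': [('li-id', 'ui', 'ui')], 'colors': ['#EAEAEA'],
#    'link': 'https://.../shot.png', 'views': '123', 'likes': '45'}
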
# Download

def save_response(directory, response):
    """Save the body of a response to a file named after its URL

    :directory: the directory to save into
    :response: a requests response whose content is the file to save
    """
    url = response.url
    name = os.path.split(url)[-1]
    with open(os.path.join(directory, name), 'wb') as fp:
        # the default chunk size of iter_content is a single byte, which is
        # far too slow for images
        for chunk in response.iter_content(chunk_size=8192):
            fp.write(chunk)

def process_one_page(name):
    """Download the image and fetch the information of a single shot

    :name: the shot name, as extracted by extract_name
    """
    logging.info('Processing for shot: ' + name)
    filename = os.path.join(config['output-dir'], name)
    if os.path.exists(filename + '.json'):
        logging.info('Already Processed: ' + name)
        return

    # fetch the information
    page = get_image_page(name)
    info = extract_image_info(page.content)
    info['name'] = name

    # save the information
    with open(filename + '.json', 'w', encoding='utf-8') as fp:
        json.dump(info, fp, indent=4, ensure_ascii=False)

    # fetch the links and save the corresponding images
    if not os.path.isdir(filename):
        os.mkdir(filename)
    directory = filename

    if 'link' in info:
        print('>>>>>>>> download image link:', info['link'])
        image = bot.get(url=info['link'])
        save_response(directory, image)

    if 'attach' in info:
        print('>>>>>>>> download attachment:', info['attach'])
        for link in info['attach']:
            image = bot.get(url=link)
            save_response(directory, image)

def fetch_pages(index_url):
    """Fetch and process every shot listed on an index page

    :index_url: the URL of a shots index page
    """
    logging.info('Fetching page: ' + index_url)
    try:
        sp_ret = BeautifulSoup(bot.get(url=index_url).content, 'html.parser')
        names = extract_name(extract_links(sp_ret))
        for name in names:
            try:
                process_one_page(name)
            except Exception:
                # a single broken shot should not stop the whole page
                traceback.print_exc()
    except Exception:
        traceback.print_exc()

def run_page(index):
    """Fetch information for the index page with the given page number

    :index: the page number, starting from 1
    """
    base_url = 'https://dribbble.com/shots?page='
    url = base_url + str(index)
    fetch_pages(url)


if __name__ == '__main__':
    if not os.path.exists(config['output-dir']):
        os.mkdir(config['output-dir'])
    # crawl the first 9 index pages
    for i in range(1, 10):
        run_page(i)
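
# usage sketch, assuming the file is saved as crawler.py:
#
#   $ python3 crawler.py
#
# shots from index pages 1-9 end up under ./output/: a <name>.json holding the
# metadata, next to a <name>/ directory holding the image and any attachments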