@lotabout
Created January 17, 2016 14:08
A crawler for dribbble.com
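The script below walks dribbble.com's shot listing pages, extracts each shot's metadata (title, author, description, tags, colors, view/like counts) into a JSON file, and downloads the shot image plus any attachments. It targets the page markup as of early 2016, so the CSS selectors may need updating if Dribbble changes its layout.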
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import re
import os
import json
import logging
import traceback
logging.basicConfig(level=logging.DEBUG)
config = {'output-dir': './output', 'done-file': 'done.txt'}
bot = requests.session()
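# A single shared session keeps cookies and reuses TCP connections across
# requests, which is noticeably cheaper than a fresh requests.get() per page.
# Note: config['done-file'] is declared above but never used below.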
def extract_links(sp_main):
    """Extract the links to individual shot pages.

    :sp_main: BeautifulSoup-parsed result of a listing page
    :returns: a list of links
    """
    dribbbles = sp_main.select('div.dribbble')
    return [x.find('a', 'dribbble-link')['href'] for x in dribbbles]
def extract_name(link):
    """Extract the name from a link.

    :link: a link like '/parent/filename', or a list of such links
    :returns: the extracted name(s)
    """
    if isinstance(link, list):
        return [os.path.split(x)[-1] for x in link]
    elif isinstance(link, str):
        return os.path.split(link)[-1]
    else:
        raise TypeError('String or List type is required')
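# For example (the slug below is made up for illustration):
#   extract_name('/shots/1234567-Some-Shot')   -> '1234567-Some-Shot'
#   extract_name(['/shots/a', '/shots/b'])     -> ['a', 'b']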
def get_image_page(file_name):
    """Retrieve the HTML content of an individual shot page.

    :file_name: the name of a shot
    :returns: the response of requests' get method
    """
    base_url = 'https://dribbble.com/shots/'
    return bot.get(url=base_url + file_name)
def image_extract_title(soup):
    """Extract the title information of an image.

    :soup: BeautifulSoup-parsed shot page
    :returns: a dict with the 'title', 'author' and 'time' of the shot
    """
    ret = {}
    title_section = soup.select('#screenshot-title-section')[0]
    ret['title'] = soup.select('#screenshot-title')[0].text
    ret['author'] = title_section.select('span.shot-byline-user a')[0].text
    ret['time'] = title_section.select('span.shot-byline-date a')[0].text
    return ret
def image_extract_description(soup):
    """Extract the description of the image.

    :soup: BeautifulSoup-parsed shot page
    :returns: {'desc': description} if present, else an empty dict
    """
    desc = soup.select('div.shot-desc')
    return {} if len(desc) <= 0 else {'desc': desc[0].text}
def image_extract_tags(soup):
    """Extract the tags of an image.

    :soup: BeautifulSoup-parsed shot page
    :returns: {'tags': tags} where every tag is a (id, name, url) tuple
    """
    ret = []
    tags = soup.select('#tags')
    if len(tags) < 1:
        # no tag section on this page
        return {}
    tags = tags[0]
    for t in tags.find_all('li'):
        tag_id = t['id']
        a = t.find('a')
        tag_name = a.text.strip()
        tag_url = extract_name(a['href'])
        ret.append((tag_id, tag_name, tag_url))
    return {'tags': ret}
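# Result shape: {'tags': [(tag_id, tag_name, tag_url), ...]}, where tag_url
# is just the trailing path segment as returned by extract_name().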
def image_extract_meta(soup):
    """Extract the meta info of an image such as likes and views.

    :soup: BeautifulSoup-parsed shot page
    :returns: a dict with the 'views' and 'likes' counts
    """
    return {'views': re.findall(r'\d+', soup.select('.views-count')[0].text)[0],
            'likes': re.findall(r'\d+', soup.select('.likes-count')[0].text)[0]}
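# Note: only the first run of digits survives, so a comma-grouped count like
# '1,234' would come out as '1'; stripping commas before matching would be a
# safer parse if Dribbble formats counts that way.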
def image_extract_colors(soup):
    """Extract the palette colors of an image.

    :soup: BeautifulSoup-parsed shot page
    :returns: {'colors': colors} where colors is a list of color codes
    """
    ret = []
    colors = soup.select('li.color')
    for c in colors:
        ret.append(c.text.strip())
    return {'colors': ret}
def image_extract_link(soup):
    """Extract the actual image link of a shot.

    :soup: BeautifulSoup-parsed shot page
    :returns: {'link': url} where url is the actual image URL
    """
    return {'link': soup.select('div.single-img img')[0]['src']}
def image_extract_attach(soup):
    """Extract the links of attachments.

    :soup: BeautifulSoup-parsed shot page
    :returns: {'attach': links} if any attachment exists, else an empty dict
    """
    base_url = 'https://dribbble.com'
    ret = []
    links = [x['href'] for x in soup.select('div.attachments a')]
    for link in links:
        try:
            sp_attach = BeautifulSoup(bot.get(url=base_url + link).content,
                                      'html.parser')
            for img in sp_attach.select('#viewer-img img'):
                ret.append(img['src'])
        except Exception:
            logging.debug('Error on fetching attachment: ' + link)
            traceback.print_exc()
    return {} if len(ret) <= 0 else {'attach': ret}
def extract_image_info(content):
    """Extract the information of interest from a shot page.

    :content: the HTML content of a shot page, as returned by get_image_page
    :returns: a dict of the extracted information
    """
    ret = {}
    soup = BeautifulSoup(content, "html.parser")
    ret.update(image_extract_title(soup))
    ret.update(image_extract_description(soup))
    ret.update(image_extract_tags(soup))
    ret.update(image_extract_colors(soup))
    ret.update(image_extract_link(soup))
    ret.update(image_extract_meta(soup))
    ret.update(image_extract_attach(soup))
    return ret
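# A fully populated result looks roughly like:
#   {'title': ..., 'author': ..., 'time': ..., 'desc': ..., 'tags': [...],
#    'colors': [...], 'link': ..., 'views': ..., 'likes': ..., 'attach': [...]}
# 'desc', 'tags' and 'attach' are omitted when the page lacks them.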
# Download
def save_response(directory, response):
    """Save the body of a response to a file named after its URL.

    :directory: the directory to save the file in
    :response: a response object returned by requests
    """
    url = response.url
    name = os.path.split(url)[-1]
    with open(os.path.join(directory, name), 'wb') as fp:
        for chunk in response.iter_content():
            fp.write(chunk)
def process_one_page(name):
    """Download the resources and save the information of a single shot.

    :name: the name of the shot
    """
    logging.info('Processing for shot: ' + name)
    filename = os.path.join(config['output-dir'], name)
    if os.path.exists(filename + '.json'):
        logging.info('Already Processed: ' + name)
        return

    # fetch the information
    page = get_image_page(name)
    info = extract_image_info(page.content)
    info['name'] = name

    # save the information
    with open(filename + '.json', 'w') as fp:
        json.dump(info, fp, indent=4, ensure_ascii=False)

    # fetch the links and save the corresponding images; check membership
    # first, since 'attach' (and possibly 'link') may be absent
    if not os.path.isdir(filename):
        os.mkdir(filename)
    directory = filename
    if 'link' in info:
        print('>>>>>>>> download image link:', info['link'])
        image = bot.get(url=info['link'])
        save_response(directory, image)
    if 'attach' in info:
        print('>>>>>>>> download attachment:', info['attach'])
        for link in info['attach']:
            image = bot.get(url=link)
            save_response(directory, image)
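# For a shot named NAME this produces:
#   <output-dir>/NAME.json   the extracted metadata
#   <output-dir>/NAME/       the main image plus any attachments
# The .json file doubles as the "already processed" marker for resuming.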
def fetch_pages(index_url):
    """Process every shot listed on an index page.

    :index_url: the URL of a listing (index) page
    """
    logging.info('Fetching page: ' + index_url)
    try:
        sp_ret = BeautifulSoup(bot.get(url=index_url).content, 'html.parser')
        names = extract_name(extract_links(sp_ret))
        for name in names:
            try:
                process_one_page(name)
            except Exception:
                # a failure on one shot should not stop the whole page
                traceback.print_exc()
    except Exception:
        traceback.print_exc()
def run_page(index):
    """Fetch the information of listing page `index`.

    :index: the page number of the listing page
    """
    base_url = 'https://dribbble.com/shots?page='
    url = base_url + str(index)
    fetch_pages(url)

if __name__ == '__main__':
    if not os.path.exists(config['output-dir']):
        os.mkdir(config['output-dir'])
    for i in range(1, 10):
        run_page(i)