Skip to content

Instantly share code, notes, and snippets.

@hmml
Last active August 21, 2016 19:01
Show Gist options
  • Save hmml/4648153 to your computer and use it in GitHub Desktop.
Save hmml/4648153 to your computer and use it in GitHub Desktop.
Download meta information about all posts from http://thejoysofcode.com and http://devopsreactions.tumblr.com to JSON files.
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Download meta information from:
# - http://thejoysofcode.com
# - http://devopsreactions.tumblr.com
#
# Script downloads title, link and image url for each post. First run will
# extract all posts. Following executions just update db. Posts are stored
# in JSON format in 'posts-joys.json' and 'posts-devops.json'.
#
# Requirements: BeautifulSoup4 and lxml
import json
import gzip
from urllib2 import urlopen
from bs4 import BeautifulSoup
from StringIO import StringIO
def getContent(url):
    """Fetch *url* and return the response body as a string.

    Transparently decompresses gzip-encoded responses (Tumblr servers
    sometimes gzip even when the client did not ask for it).
    """
    response = urlopen(url)
    try:
        body = response.read()
    finally:
        # Original leaked the socket; always release it.
        response.close()
    # Headers remain readable after close(); check how the body was encoded.
    if response.info().get('Content-Encoding') == 'gzip':
        body = gzip.GzipFile(fileobj=StringIO(body)).read()
    return body
def JOYS_extractPostsFromURL(url):
    """Scrape one listing page of thejoysofcode.com.

    Returns a list of dicts, one per post, each with 'link', 'title'
    and 'image' keys.
    """
    posts = []
    soup = BeautifulSoup(getContent(url), "lxml")
    for entry in soup.select('article > div'):
        try:
            post = {}
            post['link'] = entry.h2.a['href']
            post['title'] = ''.join(entry.h2.a.contents)
            post['image'] = entry.find('img')['src']
            posts.append(post)
        # Narrowed from a bare except: some <div>s are not posts and lack
        # the h2/a/img structure — skip those, but let real errors
        # (KeyboardInterrupt, SystemExit, ...) propagate.
        except (AttributeError, KeyError, TypeError):
            print('Encountered little problem when scraping %s' % url)
    return posts
def DEVOPS_extractPostsFromURL(url):
    """Scrape one listing page of devopsreactions.tumblr.com.

    Returns a list of dicts, one per post, each with 'link', 'title'
    and 'image' keys.
    """
    posts = []
    soup = BeautifulSoup(getContent(url), "lxml")
    for entry in soup.select('.item_content'):
        try:
            post = {}
            post['link'] = entry.div.a['href']
            post['title'] = ''.join(entry.div.a.contents)
            post['image'] = entry.find('img')['src']
            posts.append(post)
        # Narrowed from a bare except: elements without the expected
        # div/a/img structure are skipped; real errors still propagate.
        except (AttributeError, KeyError, TypeError):
            print('Encountered little problem when scraping %s' % url)
    return posts
def updateListOfPosts(posts, url, extractor, limit=None):
    """Crawl `url` page by page, appending new posts to *posts*.

    posts     -- existing list of post dicts (mutated in place, returned)
    url       -- site base URL; pages are fetched from <url>/page/<n>
    extractor -- callable(page_url) -> list of post dicts
    limit     -- optional maximum number of pages to fetch

    Crawling stops when a page yields no posts (end of site), when a
    previously seen post is encountered (feeds are newest-first, so
    everything after it is already stored), or when `limit` pages have
    been processed.
    """
    pageNumber = 0
    while True:
        pageNumber += 1
        page = '%s/page/%d' % (url, pageNumber)
        # Single-argument print() calls work identically on Python 2
        # and 3; the original print statements were Python-2-only.
        print('Extracting posts from: %s ...' % page)
        newPosts = extractor(page)
        if not newPosts:
            break
        samePostEncountered = False
        for post in newPosts:
            if post in posts:
                print('  already exists: %s' % post['title'].encode('utf-8'))
                samePostEncountered = True
            else:
                posts.append(post)
        if limit and limit == pageNumber:
            print('Limit, break')
            break
        if samePostEncountered:
            print('Same posts found, assuming up-to-date.')
            break
    return posts
# Sites to crawl: each entry pairs the base URL with its JSON cache
# file and the scraper that understands that site's markup.
sites = [
    {'url': 'http://thejoysofcode.com',
     'file': 'posts-joys.json',
     'extractor': JOYS_extractPostsFromURL},
    {'url': 'http://devopsreactions.tumblr.com',
     'file': 'posts-devops.json',
     'extractor': DEVOPS_extractPostsFromURL},
]
if __name__ == '__main__':
    for site in sites:
        # Load the previously saved posts; start fresh when the cache
        # file is missing or contains invalid JSON. (Narrowed from a
        # bare except so real errors are not silently swallowed.)
        try:
            with open(site['file'], 'r') as f:
                posts = json.loads(f.read())
        except (IOError, ValueError):
            posts = []
        posts = updateListOfPosts(posts, site['url'], site['extractor'])
        # with-blocks close the handles the original leaked to the GC.
        with open(site['file'], 'w') as f:
            f.write(json.dumps(posts, indent=4, separators=(',', ': ')))
@rodrigoacilia
Copy link

rodrigoacilia commented Aug 21, 2016

I'm not into Python. I have a tumblr without pagination (infinite scroll) — what should I change?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment