Last active
August 21, 2016 19:01
-
-
Save hmml/4648153 to your computer and use it in GitHub Desktop.
Download meta information about all posts from http://thejoysofcode.com and http://devopsreactions.tumblr.com to JSON files.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# -*- coding: utf-8 -*- | |
# | |
# Download meta information from: | |
# - http://thejoysofcode.com | |
# - http://devopsreactions.tumblr.com | |
# | |
# Script downloads title, link and image url for each post. First run will | |
# extract all posts. Following executions just update db. Posts are stored | |
# in JSON format in 'posts-joys.json' and 'posts-devops.json'. | |
# | |
# Requirements: BeautifulSoup4 and lxml | |
import json | |
import gzip | |
from urllib2 import urlopen | |
from bs4 import BeautifulSoup | |
from StringIO import StringIO | |
def getContent(url):
    """Fetch *url* and return the response body as a byte string,
    transparently decompressing gzip-encoded responses.

    The HTTP response is always closed, even if reading fails.
    """
    response = urlopen(url)
    try:
        if response.info().get('Content-Encoding') == 'gzip':
            # Server compressed the payload: wrap it and inflate.
            buf = StringIO(response.read())
            return gzip.GzipFile(fileobj=buf).read()
        return response.read()
    finally:
        # Fix: the original leaked the connection; release it here.
        response.close()
def JOYS_extractPostsFromURL(url):
    """Scrape one page of thejoysofcode.com.

    Returns a list of dicts with 'link', 'title' and 'image' keys, one
    per post found on the page. Posts with malformed markup are skipped
    with a warning instead of aborting the whole page.
    """
    posts = []
    soup = BeautifulSoup(getContent(url), "lxml")
    for e in soup.select('article > div'):
        try:
            post = {
                'link': e.h2.a['href'],
                'title': ''.join(e.h2.a.contents),
                'image': e.find('img')['src'],
            }
        # Fix: was a bare `except:` that swallowed everything,
        # including KeyboardInterrupt/SystemExit. Only catch what the
        # tag navigation above can actually raise (missing h2/a/img ->
        # AttributeError/TypeError, missing attr -> KeyError).
        except (AttributeError, KeyError, TypeError):
            print('Encountered little problem when scraping %s' % url)
        else:
            posts.append(post)
    return posts
def DEVOPS_extractPostsFromURL(url):
    """Scrape one page of devopsreactions.tumblr.com.

    Returns a list of dicts with 'link', 'title' and 'image' keys, one
    per post found on the page. Posts with malformed markup are skipped
    with a warning instead of aborting the whole page.
    """
    posts = []
    soup = BeautifulSoup(getContent(url), "lxml")
    for e in soup.select('.item_content'):
        try:
            post = {
                'link': e.div.a['href'],
                'title': ''.join(e.div.a.contents),
                'image': e.find('img')['src'],
            }
        # Fix: was a bare `except:` that swallowed everything,
        # including KeyboardInterrupt/SystemExit. Only catch what the
        # tag navigation above can actually raise.
        except (AttributeError, KeyError, TypeError):
            print('Encountered little problem when scraping %s' % url)
        else:
            posts.append(post)
    return posts
def updateListOfPosts(posts, url, extractor, limit=None):
    """Walk the blog's /page/N pages, appending new posts to *posts*.

    Stops when a page yields no posts, when an already-known post is
    encountered (the blogs list newest first, so that means we are up
    to date), or after *limit* pages when given.

    Mutates *posts* in place and also returns it.
    """
    pageNumber = 0
    while True:
        pageNumber += 1
        page = '%s/page/%d' % (url, pageNumber)
        print('Extracting posts from: %s ...' % page)
        newPosts = extractor(page)
        if not newPosts:
            # Ran past the last page.
            break
        samePostEncountered = False
        for post in newPosts:
            if post in posts:
                print(' already exists: %s' % post['title'].encode('utf-8'))
                samePostEncountered = True
            else:
                posts.append(post)
        if samePostEncountered:
            print('Same posts found, assuming up-to-date.')
            break
        # BUG FIX: the limit check used to sit inside the for-loop above
        # and only broke out of that inner loop, so the while-loop kept
        # fetching pages and `limit` was silently ignored. Check it once
        # per completed page and stop the whole crawl.
        if limit and pageNumber >= limit:
            print('Limit, break')
            break
    return posts
# Sites to scrape: each entry carries the blog's base URL, the JSON
# file its posts are cached in, and the page-scraping function to use.
sites = [{'url': 'http://thejoysofcode.com',
          'file': 'posts-joys.json',
          'extractor': JOYS_extractPostsFromURL},
         {'url': 'http://devopsreactions.tumblr.com',
          'file': 'posts-devops.json',
          'extractor': DEVOPS_extractPostsFromURL}]
if __name__ == '__main__':
    for site in sites:
        # Load previously saved posts; start fresh when the cache file
        # is missing or holds invalid JSON.
        # Fix: was a bare `except:` and leaked the file handle.
        try:
            with open(site['file'], 'r') as f:
                posts = json.loads(f.read())
        except (IOError, ValueError):
            posts = []
        updated = updateListOfPosts(posts, site['url'], site['extractor'])
        serialized = json.dumps(updated, indent=4, separators=(',', ': '))
        with open(site['file'], 'w+') as f:
            f.write(serialized)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I'm not into Python. I have a Tumblr without pagination (infinite scroll) — what should I change?