Skip to content

Instantly share code, notes, and snippets.

@mh-github
Created July 27, 2013 13:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mh-github/6094840 to your computer and use it in GitHub Desktop.
Save mh-github/6094840 to your computer and use it in GitHub Desktop.
Fetch blogger posts with Python
#!/usr/bin/python
import urllib2
from BeautifulSoup import BeautifulSoup
from BeautifulSoup import BeautifulStoneSoup
from nltk import clean_html
from HTMLParser import HTMLParser
import feedparser
import os
import sys
from datetime import datetime as dt
import json
def cleanHtml(text):
    """Strip HTML markup from *text* and decode HTML entities.

    The input is coerced to a string, run through nltk's clean_html to
    drop tags, then parsed once more so character entities (&amp; etc.)
    are converted.  Returns the first remaining text node, or the
    sentinel string "NO NON_HTML TEXT FOUND" when nothing is left.
    """
    stripped = clean_html(str(text))
    soup = BeautifulStoneSoup(
        stripped,
        convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
    try:
        return soup.contents[0]
    except IndexError:
        # clean_html removed everything -- no plain text in this node.
        return "NO NON_HTML TEXT FOUND"
blog_id = sys.argv[1]
JSON_FEED_URL = 'http://' + blog_id + '.blogspot.com/feeds/posts/default?alt=json'
json_feed = urllib2.urlopen(JSON_FEED_URL)
data = json.load(json_feed)
pages_count = data["feed"]["openSearch$totalResults"]["$t"]
print "Total blog entries = " + pages_count
pages_batch = 500
blog_posts = []
endIndex = (int(pages_count) / pages_batch) + 2
fetch_count = 0
for counter in range(1, endIndex):
FEED_URL = 'http://' + blog_id + '.blogspot.com/feeds/posts/default?start-index=' + str((counter - 1) * pages_batch + 1) + '&max-results=500'
fp = feedparser.parse(FEED_URL)
for e in fp.entries:
link = e.links[4].href
fetch_count = fetch_count + 1
print str(fetch_count) + ' ' + link
page = urllib2.urlopen(link)
soup = BeautifulSoup(page, fromEncoding="utf-8")
content = soup.find('div', attrs={'class' : 'post-body entry-content'})
content = cleanHtml(content)
if content == "NO NON_HTML TEXT FOUND":
print "NO NON_HTML TEXT FOUND"
continue
blog_posts.append({'title': e.title, 'content' : content, 'link': link})
if not os.path.isdir('out'):
os.mkdir('out')
out_file = '%s__%s.json' % (fp.feed.title, dt.utcnow())
f = open(os.path.join(os.getcwd(), 'out', out_file), 'w')
f.write(json.dumps(blog_posts))
f.close()
@tshrinivasan
Copy link

Change line 58 as shown below:
f.write(json.dumps(blog_posts,ensure_ascii=False).encode('utf8'))

to have unicode text in the json file.

Thanks for the nice work.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment