Skip to content

Instantly share code, notes, and snippets.

@jjgod
Created February 13, 2012 23:02
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save jjgod/1821303 to your computer and use it in GitHub Desktop.
Save jjgod/1821303 to your computer and use it in GitHub Desktop.
Backup Douban Notes
#!/usr/bin/env python
# Douban Notes Grab: back up a Douban user's notes as local .txt files.
import sys, urllib, re, HTMLParser, time, os
def save(url, title, t):
    """Download one note and store its body as "<title>.txt".

    The note body is the text found between the '<pre class="note">' marker
    and the following '</pre></div>' marker in the page at *url*.

    Args:
        url: address of the note page.
        title: note title; the output file is named "<title>.txt".
        t: timestamp (seconds since the epoch) applied to the file as both
            its access and modification time.

    If the output file already exists, only its timestamps are refreshed and
    the page is not downloaded. If the page never yields a complete
    start/end marker pair, nothing is written.
    """
    fname = "%s.txt" % title
    # Check for an existing backup first so no request is made for it.
    if os.path.isfile(fname):
        os.utime(fname, (t, t))
        return

    startstr = '<pre class="note">'
    endstr = '</pre></div>'
    parts = []
    saving = False
    complete = False

    f = urllib.urlopen(url)
    try:
        for line in f:
            if not saving:
                start = line.find(startstr)
                if start == -1:
                    continue
                saving = True
                # Keep only what follows the start marker, then fall through
                # so an end marker on this same line is still honoured.
                line = line[start + len(startstr):]
            end = line.find(endstr)
            if end != -1:
                # Text preceding the end marker still belongs to the note.
                parts.append(line[:end])
                complete = True
                break
            parts.append(line)
    finally:
        f.close()

    if not complete:
        return

    print("Saving %s" % fname)
    with open(fname, "w") as w:
        w.write("".join(parts))
    os.utime(fname, (t, t))
def fetch(url):
    """Process one notes-listing page and return the next page's URL.

    The page at *url* is scanned line by line:

    * a note heading line (an '<h3><a class="ll" ...>' link) records the
      note's title and address;
    * a '<span class="pl">' timestamp line triggers save() for the most
      recently recorded note, using the parsed timestamp as the file time;
    * a '<link rel="next" .../>' line ends the scan; its href is printed
      and returned.

    Args:
        url: address of the listing page.

    Returns:
        The href of the page's "next" link, or None when the page has none.

    Raises:
        ValueError: if a matched timestamp line is not in
            "%Y-%m-%d %H:%M:%S" format (from time.strptime).
    """
    # Compile once per page instead of looking the patterns up on every line.
    title_re = re.compile('.*<h3><a class="ll" title="([^"]*)" href="([^"]*)">(.*)</a>')
    date_re = re.compile('.*<div><span class="pl">([^<]*)</span></div>')
    next_re = re.compile('.*<link rel="next" href="([^"]*)"/>')

    h = HTMLParser.HTMLParser()
    title = None
    note_url = None

    f = urllib.urlopen(url)
    try:
        for line in f:
            m = title_re.match(line)
            if m:
                # The link text is UTF-8 bytes with HTML entities; decode and
                # unescape it to get the title used for the file name.
                title = h.unescape(m.group(3).decode("utf-8"))
                note_url = m.group(2)
                continue

            m = date_re.match(line)
            if m:
                if note_url is None:
                    # A timestamp line with no preceding note heading: there
                    # is nothing to download, so skip it.
                    continue
                t = time.strptime(m.group(1), "%Y-%m-%d %H:%M:%S")
                save(note_url, title, int(time.mktime(t)))
                continue

            m = next_re.match(line)
            if m:
                print(m.group(1))
                return m.group(1)
    finally:
        # Runs on the early return above as well as on normal exhaustion.
        f.close()
    return None
# Script entry: the single required argument is the Douban user id.
if len(sys.argv) < 2:
    print("usage: %s <id>" % sys.argv[0])
    sys.exit(1)
userid = sys.argv[1]
url = "http://www.douban.com/people/%s/notes" % userid
# fetch() returns the next listing page's URL, or None when there is no
# further page, so keep following it until it runs out.
while url is not None:
    url = fetch(url)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment