Skip to content

Instantly share code, notes, and snippets.

@poochin
Created May 1, 2011 13:15
Show Gist options
  • Save poochin/950489 to your computer and use it in GitHub Desktop.
Save poochin/950489 to your computer and use it in GitHub Desktop.
Tumblr Photos
#!/usr/bin/python
# -*- coding: utf-8 -*-
import re
import sys, os, getpass
import cookielib, urllib, urllib2
from optparse import OptionParser
from sqlalchemy import create_engine, Table, Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.sql.expression import desc
from sqlalchemy.orm import relationship, backref
from sqlalchemy.orm import sessionmaker
from sqlalchemy import ForeignKey
from BeautifulSoup import BeautifulSoup
# define
# Tumblr endpoints used by the script; 'photos' is formatted with a page number.
urls = {
    'login': 'https://www.tumblr.com/login',
    'photos': 'http://www.tumblr.com/photos/page/%d',
}
# Resolve the home directory with expanduser() so that an unset HOME
# environment variable does not abort the import with a KeyError.
home = os.path.expanduser('~')
# SQLite database file in which the fetched posts are stored.
dbpath = os.path.join(home, '.tumblephotos.db')
'''
Database schema

users
    id: integer (primary_key)
    email: string
    backref: dashboard to users

dashboard
    postid: integer (primary_key)
    userid: integer (foreignkey=users.id)
    key: string
    tumblelog_url: string
    photo_url: string
    thumbnail_url: string
'''
# Open the database: an SQLite engine on ``dbpath`` with SQL echo disabled,
# and the declarative base class bound to that engine.
engine = create_engine('sqlite:///' + dbpath, echo=False)
Base = declarative_base(bind=engine)
class User(Base):
    """An account, identified by its login e-mail address.

    Rows are stored in the ``users`` table.  ``posts`` lists the related
    ``Dashboard`` rows; the relationship also installs a ``users``
    back-reference on ``Dashboard``.
    """

    __tablename__ = 'users'

    id = Column(Integer, primary_key=True)
    email = Column(String)
    posts = relationship('Dashboard', backref='users')

    def __init__(self, email):
        """Create a user for the given e-mail address."""
        self.email = email

    def __repr__(self):
        """Return a short representation that shows the e-mail address."""
        return "<User('%s')>" % self.email
class RecentlyPost(Base):
    """Row of the ``recentlyposts`` table linking a user id to a stored post.

    ``post`` is the related ``Dashboard`` row; the relationship also installs
    a ``uniqueposts`` back-reference on ``Dashboard``.
    """

    __tablename__ = 'recentlyposts'

    # NOTE(review): the column definition was missing here (``postid =`` with
    # no value, a syntax error).  A primary key referencing dashboard.postid
    # is assumed, because the mapping needs a primary key and the ``post``
    # relationship needs a foreign key to ``dashboard`` -- confirm intent.
    postid = Column(Integer, ForeignKey('dashboard.postid'), primary_key=True)
    userid = Column(Integer, ForeignKey('users.id'))
    post = relationship('Dashboard', backref='uniqueposts')
class Dashboard(Base):
    """One photo post, stored in the ``dashboard`` table.

    ``postid`` is the primary key and ``userid`` references ``users.id``.
    ``postid`` and ``key`` are the two values puthtml() places in the
    reblog link of a post.
    """

    __tablename__ = 'dashboard'

    postid = Column(Integer, primary_key=True)
    key = Column(String)
    tumblelog_url = Column(String)
    photo_url = Column(String)
    thumbnail_url = Column(String)
    userid = Column(Integer, ForeignKey('users.id'))

    def __init__(self, postid, key, tumblelog_url, photo_url, thumbnail_url):
        """Store the given values on the new row; ``userid`` is left unset."""
        self.postid, self.key = postid, key
        self.tumblelog_url = tumblelog_url
        self.photo_url, self.thumbnail_url = photo_url, thumbnail_url

    def __repr__(self):
        """Return a representation showing the post id, key and photo URL."""
        shown = (self.postid, self.key, self.photo_url)
        return "<Dashboard('%s','%s', '%s')>" % shown
# Create the tables declared on ``Base`` in the database behind ``engine``,
# then open the one session object that the functions below share.
Base.metadata.create_all(bind=engine)
_session_factory = sessionmaker(bind=engine)
session = _session_factory()
# Patterns for the fields of one ``photos[<id>] = {...}`` JavaScript entry,
# listed in the order in which they are looked up.
_POST_FIELD_PATTERNS = (
    ('tumblelog_url', re.compile(r"(?<=tumblelog_url: ')[^']+")),
    ('postid', re.compile(r"(?<=photos\[)\d+")),
    ('key', re.compile(r"(?<=key: ')\w+")),
    ('photo_url', re.compile(r"(?<=photo_url: ')[^']+")),
)
# Digits that follow an underscore and precede the final ".ext" of a URL.
_PHOTO_SIZE_RE = re.compile(r'(?<=_)\d+(?=\.\w+$)')


def js2post(photo_js):
    """Build a ``Dashboard`` object from one ``photos[<id>] = {...}`` snippet.

    Args:
        photo_js: text of a single JavaScript photo entry.

    Returns:
        A ``Dashboard`` whose fields are the matched strings (``postid`` is
        not converted to an integer).  ``thumbnail_url`` is ``photo_url``
        with the digits before the file extension replaced by ``75sq``.

    Raises:
        ValueError: if ``tumblelog_url``, the post id, ``key`` or
            ``photo_url`` cannot be found in ``photo_js``.
    """
    fields = {}
    for name, pattern in _POST_FIELD_PATTERNS:
        match = pattern.search(photo_js)
        if match is None:
            # Name the missing field instead of failing later with an
            # AttributeError on ``None.group()``.
            raise ValueError('%s not found in photo entry: %r'
                             % (name, photo_js))
        fields[name] = match.group()
    thumbnail_url = _PHOTO_SIZE_RE.sub('75sq', fields['photo_url'])
    return Dashboard(fields['postid'], fields['key'], fields['tumblelog_url'],
                     fields['photo_url'], thumbnail_url)
def parse_html(html):
    """Extract the photo posts embedded in a photos page.

    Only the last ``<script>`` element of the document is searched for
    ``photos[<id>] = {...}`` entries; each entry is converted with js2post().

    Args:
        html: markup of one photos page.

    Returns:
        A list of ``Dashboard`` objects in document order; empty when the
        page has no ``<script>`` element or the last one holds no entries.
    """
    soup = BeautifulSoup(html)
    scripts = soup.findAll('script')
    if not scripts:
        # Nothing to search; indexing [-1] here would raise IndexError.
        return []
    entry_re = re.compile(r'photos\[\d+\] = {[^}]+}')
    return [js2post(entry) for entry in entry_re.findall(str(scripts[-1]))]
def getto(email, password, dstid):
    """Log in and store the photo posts that are newer than ``dstid``.

    Pages 1 to 99 of the photos listing are read in order.  Posts are
    attached to the user until one with an id <= ``dstid`` is met or a page
    yields no posts; the session is then committed.

    Args:
        email: login e-mail address; also selects the ``User`` row to use.
        password: password submitted with the login form.
        dstid: integer post id at which collecting stops (not stored itself).

    Returns:
        False when the ``logged_in`` cookie reports a failed login,
        otherwise None.
    """
    # Look the user up before creating one.  A freshly constructed User is
    # never contained in the session, so testing ``user in session`` added a
    # duplicate row on every run, while putfrom() reads only the first row
    # that matches the e-mail.
    user = session.query(User).filter(User.email == email).first()
    if user is None:
        user = User(email)
        session.add(user)

    # Login
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    senddata = {'email': email, 'password': password}
    opener.open(urls['login'], urllib.urlencode(senddata))
    # NOTE(review): this reads the private CookieJar._cookies mapping and
    # carries on when no '.tumblr.com' cookies were set at all -- confirm
    # that this lenient check is intended.
    if '.tumblr.com' in cj._cookies:
        logged_in = cj._cookies['.tumblr.com']['/']['logged_in'].value
        if int(logged_in) != 1:
            return False

    for page in range(1, 100):
        html = opener.open(urls['photos'] % page).read()
        posts = parse_html(html)
        if not posts:
            # Past the end of the listing: do not request the remaining pages.
            break
        reached = False
        for post in posts:
            if int(post.postid) <= dstid:
                reached = True
                break
            user.posts.append(post)
        if reached:
            break
    session.commit()
def putraw(posts):
    """Print the string form of each post, one per line.

    Args:
        posts: iterable of objects to print.
    """
    for post in posts:
        # The parenthesised single-argument form gives the same output with
        # the Python 2 print statement and the Python 3 print function.
        print(post)
def puthtml(posts):
    """Print an HTML page showing a linked thumbnail for every post.

    Each thumbnail links to ``http://www.tumblr.com/reblog/<postid>/<key>``.

    Args:
        posts: iterable of objects with ``postid``, ``key`` and
            ``thumbnail_url`` attributes.
    """
    # Every print takes one parenthesised argument so that the output is the
    # same with the Python 2 print statement and the Python 3 function.
    print('<html>')
    print('<head>')
    # The space after type="text/css" keeps the two attributes separate;
    # the adjacent literals are concatenated as written.
    print('<link rel="stylesheet" type="text/css" '
          'href="http://assets.tumblr.com/stylesheets/photos.css?8"/>')
    print('</head>')
    print('<body>')
    print('<div id="container">')
    for post in posts:
        # NOTE: the values are interpolated without HTML escaping.
        print('<a href="http://www.tumblr.com/reblog/%s/%s">'
              '<img class="thumbnail" src="%s"></a>'
              % (post.postid, post.key, post.thumbnail_url))
    print('</div>')
    print('</body>')
    print('</html>')
def putfrom(email, dstid, html=False, overlap=False):
    """Print the stored posts of a user, newest first.

    Args:
        email: e-mail address of the user whose posts are listed.
        dstid: only posts with ``postid >= dstid`` are listed.
        html: print through puthtml() when true, through putraw() otherwise.
        overlap: reserved for duplicate removal; currently has no effect.
    """
    user = session.query(User).filter(User.email == email).first()
    if user is None:
        # No row exists for this e-mail, so there is nothing to list;
        # reading ``user.id`` would raise AttributeError.
        posts = []
    else:
        posts = session.query(Dashboard).\
            filter(Dashboard.userid == user.id).\
            filter(Dashboard.postid >= dstid).\
            order_by(desc(Dashboard.postid)).all()
    if overlap:
        # TODO: duplicate removal is not implemented; still undecided
        # whether to do it here or to solve it on the SQL side.
        pass
    if html:
        puthtml(posts)
    else:
        putraw(posts)
def main():
    """Command-line entry point.

    Takes one positional ``dest_id`` argument.  With ``-o`` the saved posts
    with an id of at least ``dest_id`` are printed (as HTML with ``--html``).
    Without ``-o`` the password is prompted for, posts newer than ``dest_id``
    are fetched with getto(), and every saved post is then printed.

    A wrong number of arguments or a non-integer ``dest_id`` ends the program
    through ``parser.error`` (usage message on stderr, exit status 2).
    """
    parser = OptionParser(usage='usage: %prog dest_id [options]')
    parser.add_option('-u', dest='email', help='Login email address')
    parser.add_option('-o', dest='output', action='store_true',
                      help='Output saved posts.')
    parser.add_option('--html', dest='html', action='store_true',
                      help='HTML output')
    parser.add_option('--overlap', dest='overlap', action='store_true',
                      help='Without overlaped posts')
    parser.set_defaults(html=False)
    parser.set_defaults(overlap=False)
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('exactly one dest_id argument is required')
    try:
        destid = int(args[0])
    except ValueError:
        # Report bad input as a usage error rather than a traceback.
        parser.error('dest_id must be an integer: %r' % (args[0],))

    email = options.email
    if email is None:
        email = raw_input('Input email: ')

    if options.output:
        putfrom(email, destid, html=options.html, overlap=options.overlap)
    else:
        password = getpass.getpass('Input pass: ')
        getto(email, password, destid)
        putfrom(email, 0)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment