Created
May 1, 2011 13:15
-
-
Save poochin/950489 to your computer and use it in GitHub Desktop.
Tumblr Photos
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# -*- coding: utf-8 -*- | |
import re | |
import sys, os, getpass | |
import cookielib, urllib, urllib2 | |
from optparse import OptionParser | |
from sqlalchemy import create_engine, Table, Column, Integer, String | |
from sqlalchemy.ext.declarative import declarative_base | |
from sqlalchemy.sql.expression import desc | |
from sqlalchemy.orm import relationship, backref | |
from sqlalchemy.orm import sessionmaker | |
from sqlalchemy import ForeignKey | |
from BeautifulSoup import BeautifulSoup | |
# Endpoints used by the scraper; 'photos' is formatted with a page number.
urls = {
    'login': 'https://www.tumblr.com/login',
    'photos': 'http://www.tumblr.com/photos/page/%d',
}
# expanduser('~') still resolves a home directory when the HOME variable is
# unset, whereas os.environ['HOME'] raises KeyError in that case.
home = os.path.expanduser('~')
# Location of the SQLite database file inside the home directory.
dbpath = os.path.join(home, '.tumblephotos.db')
'''
Database schema

users
    id: integer (primary key)
    email: string
    posts: relationship to dashboard rows (backref on dashboard: users)
dashboard
    postid: integer (primary key)
    userid: integer (foreign key -> users.id)
    key: string
    tumblelog_url: string
    photo_url: string
    thumbnail_url: string
'''
# Open the database.
engine = create_engine('sqlite:///%s' % dbpath, echo=False)
Base = declarative_base(bind=engine)
class User(Base):
    """An account row, identified by its login email address."""

    __tablename__ = 'users'

    id = Column(Integer, primary_key=True)
    email = Column(String)
    # Dashboard rows linked to this user; each Dashboard row gets a ``users``
    # attribute pointing back here.
    posts = relationship('Dashboard', backref='users')

    def __init__(self, email):
        """Store the email address on the new row."""
        self.email = email

    def __repr__(self):
        """Return a short debugging representation containing the email."""
        return "<User('%s')>" % (self.email,)
class RecentlyPost(Base):
    """Association between a user and a dashboard post (work in progress).

    NOTE(review): the original ``postid`` assignment had no right-hand side,
    which is a SyntaxError that prevents the whole module from loading. It is
    completed here as the primary key referencing ``dashboard.postid`` so the
    mapper has a primary key and the ``post`` relationship has a foreign key
    to join on -- confirm this matches the intended schema.
    """

    __tablename__ = 'recentlyposts'

    postid = Column(Integer, ForeignKey('dashboard.postid'), primary_key=True)
    userid = Column(Integer, ForeignKey('users.id'))
    # Dashboard rows gain a ``uniqueposts`` attribute through the backref.
    post = relationship('Dashboard', backref='uniqueposts')
class Dashboard(Base):
    """One stored photo post, keyed by its post id."""

    __tablename__ = 'dashboard'

    postid = Column(Integer, primary_key=True)
    key = Column(String)
    tumblelog_url = Column(String)
    photo_url = Column(String)
    thumbnail_url = Column(String)
    # Owning user; filled in when the post is appended to ``User.posts``.
    userid = Column(Integer, ForeignKey('users.id'))

    def __init__(self, postid, key, tumblelog_url, photo_url, thumbnail_url):
        """Populate every column except ``userid``."""
        self.postid = postid
        self.key = key
        self.tumblelog_url = tumblelog_url
        self.photo_url = photo_url
        self.thumbnail_url = thumbnail_url

    def __repr__(self):
        """Return a debugging representation: post id, key and photo URL."""
        summary = (self.postid, self.key, self.photo_url)
        return "<Dashboard('%s','%s', '%s')>" % summary
# Emit CREATE TABLE for the declared models, then open the module-wide session.
Base.metadata.create_all(bind=engine)
session = sessionmaker(bind=engine)()
def js2post(photo_js):
    """Build a Dashboard object from one ``photos[<id>] = {...}`` snippet.

    Args:
        photo_js: JavaScript text containing ``photos[<digits>`` plus the
            single-quoted ``tumblelog_url``, ``key`` and ``photo_url`` fields.

    Returns:
        A Dashboard holding the extracted values (all as strings) and a
        thumbnail URL derived from the photo URL.

    Raises:
        AttributeError: when one of the fields is absent, because
            ``re.search`` then returns None.
    """
    # Raw strings keep the regex escapes out of Python's own string-escape
    # processing; the patterns themselves are unchanged.
    tumblelog_url = re.search(r"(?<=tumblelog_url: ')[^']+", photo_js).group()
    postid = re.search(r"(?<=photos\[)\d+", photo_js).group()
    key = re.search(r"(?<=key: ')\w+", photo_js).group()
    photo_url = re.search(r"(?<=photo_url: ')[^']+", photo_js).group()
    # Replace the number between the last "_" and the file extension with
    # "75sq" to obtain the thumbnail URL.
    thumbnail_url = re.sub(r'(?<=_)\d+(?=\.\w+$)', '75sq', photo_url)
    return Dashboard(postid, key, tumblelog_url, photo_url, thumbnail_url)
def parse_html(html):
    """Extract the photo posts embedded in the last ``<script>`` of a page.

    Args:
        html: markup of one photo-listing page.

    Returns:
        A list of Dashboard objects, one per ``photos[<id>] = {...}``
        assignment found; empty when the page has no ``<script>`` element or
        the last one contains no such assignment.
    """
    soup = BeautifulSoup(html)
    scripts = soup.findAll('script')
    if not scripts:
        # Indexing [-1] on an empty result would raise IndexError.
        return []
    # The pattern has no ^ or $ anchors, so no MULTILINE flag is needed.
    pattern = re.compile(r'photos\[\d+\] = \{[^}]+\}')
    return [js2post(snippet) for snippet in pattern.findall(str(scripts[-1]))]
def getto(email, password, dstid):
    """Log in and store the photo posts whose id is greater than ``dstid``.

    Listing pages 1 to 99 are fetched in order. Each parsed post is attached
    to the user until a post with an id less than or equal to ``dstid`` is
    met, at which point fetching stops. The session is then committed.

    Args:
        email: login email address; also selects the local User row.
        password: login password.
        dstid: integer post id at which to stop.

    Returns:
        False when the ``logged_in`` cookie reports a rejected login (nothing
        is fetched or committed in that case); otherwise None.
    """
    # Reuse the stored row for this address. A freshly constructed User is
    # never "in session", so testing that inserted a duplicate row every run.
    user = session.query(User).filter(User.email == email).first()
    if user is None:
        user = User(email)
        session.add(user)

    # Login
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    senddata = {'email': email, 'password': password}
    # Only the cookies set by the login response are needed, not its body.
    opener.open(urls['login'], urllib.urlencode(senddata)).close()
    wanted = ('.tumblr.com', '/', 'logged_in')
    for cookie in cj:
        if (cookie.domain, cookie.path, cookie.name) == wanted:
            if int(cookie.value) != 1:
                return False

    found = False
    for page in range(1, 100):
        html = opener.open(urls['photos'] % page).read()
        for post in parse_html(html):
            if int(post.postid) <= dstid:
                found = True
                break
            user.posts.append(post)
        if found:
            break
    session.commit()
def putraw(posts):
    """Write each post to standard output, one per line."""
    for entry in posts:
        print(entry)
def puthtml(posts):
    """Print ``posts`` as a minimal HTML page of reblog-linked thumbnails.

    Args:
        posts: iterable of objects exposing ``postid``, ``key`` and
            ``thumbnail_url`` attributes.
    """
    # NOTE(review): attribute values are interpolated without HTML escaping.
    print('<html>')
    print('<head>')
    # The separating space before ``href`` was missing, so the two attributes
    # were emitted fused together as ``type="text/css"href=...``.
    print('<link rel="stylesheet" type="text/css" '
          'href="http://assets.tumblr.com/stylesheets/photos.css?8"/>')
    print('</head>')
    print('<body>')
    print('<div id="container">')
    for post in posts:
        print('<a href="http://www.tumblr.com/reblog/%s/%s">'
              '<img class="thumbnail" src="%s"></a>'
              % (post.postid, post.key, post.thumbnail_url))
    print('</div>')
    print('</body>')
    print('</html>')
def putfrom(email, dstid, html=False, overlap=False):
    """Print the stored posts of ``email`` whose id is at least ``dstid``.

    Posts are listed by descending post id.

    Args:
        email: email address selecting the User row.
        dstid: lowest post id to include.
        html: print an HTML page when true, otherwise one post per line.
        overlap: reserved for duplicate removal; currently has no effect.
    """
    user = session.query(User).filter(User.email == email).first()
    if user is None:
        # No row exists for this address, so there is nothing to list;
        # dereferencing ``user.id`` here would raise AttributeError.
        posts = []
    else:
        posts = (session.query(Dashboard)
                 .filter(Dashboard.userid == user.id)
                 .filter(Dashboard.postid >= dstid)
                 .order_by(desc(Dashboard.postid))
                 .all())
    if overlap:
        # TODO: still undecided whether duplicates should be removed here or
        # handled on the SQL side.
        pass
    if html:
        puthtml(posts)
    else:
        putraw(posts)
def main():
    """Command-line entry point.

    With ``-o`` the stored posts from ``dest_id`` upwards are printed.
    Without it, the password is prompted for, new posts are fetched down to
    ``dest_id``, and then every stored post is printed.
    """
    parser = OptionParser(usage='usage: %prog dest_id [options]')
    parser.add_option('-u', dest='email', help='Login email address')
    parser.add_option('-o', dest='output', action='store_true',
                      help='Output saved posts.')
    parser.add_option('--html', dest='html', action='store_true',
                      help='HTML output')
    parser.add_option('--overlap', dest='overlap', action='store_true',
                      help='Without overlaped posts')
    parser.set_defaults(html=False, overlap=False)
    (options, args) = parser.parse_args()

    # parser.error() prints the usage line and exits with status 2, instead
    # of re-parsing a fake '-h' (which reports success on wrong usage).
    if len(args) != 1:
        parser.error('exactly one dest_id argument is required')
    try:
        destid = int(args[0])
    except ValueError:
        parser.error('dest_id must be an integer: %r' % args[0])

    email = options.email
    if email is None:
        email = raw_input('Input email: ')

    if options.output:
        putfrom(email, destid, html=options.html, overlap=options.overlap)
    else:
        password = getpass.getpass('Input pass: ')
        # getto() returns False only when the login was rejected.
        if getto(email, password, destid) is False:
            sys.exit('Login failed.')
        putfrom(email, 0)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment