PyYoshi/tumblrのアレ

## tumblrのアレ
# -*- coding: utf-8 -*-

from __future__ import generators

__author__ = "Yoshihiro Misawa (remu.biz)"
__version__ = "0.0.1"
__copyright__ = "Copyright (c) 2011 Yoshihiro Misawa"
__license__ = ""

import urllib2
from xml.etree.cElementTree import fromstring
from httplib import HTTPConnection, HTTPException
from urlparse import urlparse
from re import match
from cgi import escape

class TumbCrawler(object):
    """Crawl a tumblr blog through its XML read API (``/api/read``).

    Typical use: construct with a blog name or URL, call genAPIurls() to
    build the page URLs for a range of posts, then readAPI() to download
    them and get one dict per post.

    Relies on this file's module-level imports (urllib2, fromstring,
    HTTPConnection, HTTPException, urlparse, match, escape) and on Stripper.
    """

    # Number of posts requested per API call.
    PAGE_SIZE = 50

    def __init__(self, tid='tid', range=None):
        """Look up the account named by ``tid`` and prepare the tag stripper.

        tid   -- blog name (``name`` means ``name.tumblr.com``) or a blog URL.
        range -- unused; kept so existing callers keep working.

        Runs chkUid(), so constructing an instance performs network I/O.
        """
        self.tid = tid
        self.user_info = self.chkUid()
        self.stripper = Stripper()

    def chkUid(self):
        """Check whether ``self.tid`` names an existing tumblr account.

        Returns a dict that always contains 'flag' (1 = found, 0 = not
        found).  When the account is found it also contains 'uid',
        'api_domain', 'totalPosts' (string, as sent by the API), 'timezone'
        and 'tumblr_title'.

        A single 301 redirect is followed (blogs moved to another domain).
        HTTP/socket failures, a non-200 answer, or a response without the
        ``X-Tumblr-User`` header all yield ``{'flag': 0}``.  Errors raised
        while downloading the account metadata are not caught.
        """
        import socket  # function-scope import: only needed for the except clause

        target = self.tid
        # Raw string: same pattern text, no invalid-escape warning for "\w".
        if match(r'([^"]|^)(https?)(://[\w:;/.?%#&=+-]+)', target):
            domain = urlparse(target).netloc   # looks like a URL
        else:
            domain = target + '.tumblr.com'    # treat it as a blog name

        info = {'flag': 0}
        try:
            status, uid, location = self._probe(domain)
            if status == 301 and location:
                # The blog moved to another domain: follow one redirect.
                domain = urlparse(location).netloc
                status, uid, location = self._probe(domain)
        except (HTTPException, socket.error):
            return info
        if status != 200 or uid is None:
            return info

        info['flag'] = 1
        info['uid'] = uid
        info['api_domain'] = domain

        # One-post request: enough to read the blog-level attributes.
        elem = self._fetchXml('http://' + domain + '/api/read?num=1')
        info['totalPosts'] = elem.find('posts').get('total')
        tumblelog = elem.find('tumblelog')
        info['timezone'] = tumblelog.get('timezone')
        info['tumblr_title'] = tumblelog.get('title')
        return info

    def _probe(self, domain):
        """GET ``/`` on ``domain``; return (status, X-Tumblr-User, Location).

        Missing headers are reported as None.  The connection is always
        closed, also when the request fails.
        """
        conn = HTTPConnection(domain)
        try:
            conn.request("GET", '/')
            resp = conn.getresponse()
            return (resp.status,
                    resp.getheader('X-Tumblr-User'),
                    resp.getheader('Location'))
        finally:
            conn.close()

    def _fetchXml(self, url):
        """Download ``url`` and return the parsed XML root element."""
        resp = urllib2.urlopen(url)
        try:
            return fromstring(resp.read())
        finally:
            resp.close()

    def genAPIurls(self, start, end):
        """Build the API URLs covering posts ``start`` .. ``end - 1``.

        The range is split into pages of at most PAGE_SIZE posts; the last
        page asks only for the remaining posts.  An empty range yields an
        empty list.  The list is stored in ``self.api_urls`` (read by
        readAPI()) and returned.
        """
        uid = self.user_info['uid']
        api_urls = []
        for offset in range(start, end, self.PAGE_SIZE):
            count = min(self.PAGE_SIZE, end - offset)
            api_urls.append('http://%s.tumblr.com/api/read?start=%s&num=%s'
                            % (uid, offset, count))
        self.api_urls = api_urls
        return api_urls

    def readAPI(self):
        """Download every URL in ``self.api_urls`` and return the post dicts.

        genAPIurls() must have been called first.
        """
        posts = []
        for url in self.api_urls:
            posts.extend(self._parsePosts(self._fetchXml(url)))
        return posts

    def _parsePosts(self, elem):
        """Convert the ``posts/post`` children of ``elem`` into post dicts.

        Posts of an unknown type are reported on stdout and skipped.
        """
        handlers = {'regular': self.text,
                    'photo': self.photo,
                    'quote': self.quote,
                    'link': self.link,
                    'conversation': self.chat,
                    'audio': self.audio,
                    'video': self.video}
        posts = []
        for node in elem.findall('posts/post'):
            ttype = node.get('type')
            handler = handlers.get(ttype)
            if handler is None:
                print('Sorry. This type is unsupported.')
                continue
            posts.append(handler(node, ttype))
        return posts

    def _commonFields(self, data, ttype):
        """Return the dict entries shared by every post type.

        Paths carry no trailing '/': ElementTree 1.3+ reads ``.//tag/`` as
        ``.//tag/*`` (children of <tag>), which would never match.
        """
        return {"userID": self.user_info['uid'],
                "pID": data.get('id'),
                "ttype": ttype,
                "tags": [tag.text for tag in data.findall('.//tag') if tag.text],
                "UnixTimestamp": data.get('unix-timestamp'),
                "reblogKey": data.get('reblog-key')}

    def _escapeText(self, value):
        """HTML-escape ``value``; a missing (None) field stays None."""
        if value is None:
            return None
        return escape(value)

    def _stripTags(self, value):
        """Remove markup from ``value``; a missing (None) field stays None."""
        if value is None:
            return None
        return self.stripper.strip(value)

    def text(self, data, ttype):
        """Build the dict for a 'regular' (text) post element."""
        post = self._commonFields(data, ttype)
        post["regularTitle"] = self._stripTags(data.findtext('.//regular-title'))
        post["regularBody"] = self._escapeText(data.findtext('.//regular-body'))
        return post

    def photo(self, data, ttype):
        """Build the dict for a 'photo' post element.

        The image fields are derived from the ``photo-url`` child whose
        ``max-width`` is '75'; they stay None when there is no such child.
        """
        post = self._commonFields(data, ttype)
        photoChar = photoType = photo_url = None
        for node in data.findall('.//photo-url'):
            src = node.text
            if node.get('max-width') != '75' or not src:
                continue
            num = src.find('tumblr.com/') + 11   # 11 == len('tumblr.com/')
            photoType = src[-3:]                 # last three characters, e.g. 'jpg'
            # [num:-9] drops the host prefix and the last 9 characters
            # (presumably a '_75sq.xxx' suffix -- confirm against real URLs).
            if len(src[num:-9]) < 24:
                photoChar = None
                photo_url = src.replace("_75sq", "_500")
            else:
                photoChar = src[num:num + 24]
                photo_url = ('http://%s.tumblr.com/photo/1280/%s/1/%s'
                             % (post["userID"], post["pID"], photoChar))
        post["photoLink"] = self._stripTags(data.findtext('.//photo-link-url'))
        post["photoCaption"] = self._escapeText(data.findtext('.//photo-caption'))
        post["photoChar"] = photoChar
        post["photoType"] = photoType
        post["photo_url"] = photo_url
        return post

    def quote(self, data, ttype):
        """Build the dict for a 'quote' post element."""
        post = self._commonFields(data, ttype)
        post["quoteText"] = self._escapeText(data.findtext('.//quote-text'))
        post["quoteSource"] = self._escapeText(data.findtext('.//quote-source'))
        return post

    def link(self, data, ttype):
        """Build the dict for a 'link' post element."""
        post = self._commonFields(data, ttype)
        post["linkTitle"] = self._stripTags(data.findtext('.//link-text'))
        post["linkUrl"] = self._stripTags(data.findtext('.//link-url'))
        post["linkDescription"] = self._escapeText(
            data.findtext('.//link-description'))
        return post

    def chat(self, data, ttype):
        """Build the dict for a 'conversation' (chat) post element.

        'conversation' is a list of one-entry dicts ``{name: text}``, one
        per child of each <conversation> element, in document order.
        """
        post = self._commonFields(data, ttype)
        conversation = []
        for block in data.findall('.//conversation'):
            for line in block:   # direct iteration replaces getchildren()
                conversation.append({line.get('name'): line.text})
        post["conversationTitle"] = self._stripTags(
            data.findtext('.//conversation-title'))
        post["conversation"] = conversation
        return post

    def audio(self, data, ttype):
        """Build the dict for an 'audio' post element."""
        post = self._commonFields(data, ttype)
        post["audioCaption"] = self._escapeText(data.findtext('.//audio-caption'))
        post["audioPlayer"] = self._escapeText(data.findtext('.//audio-player'))
        return post

    def video(self, data, ttype):
        """Build the dict for a 'video' post element."""
        post = self._commonFields(data, ttype)
        post["videoCaption"] = self._escapeText(data.findtext('.//video-caption'))
        post["videoPlayer"] = self._escapeText(data.findtext('.//video-player'))
        return post

"""
htmlタグの削除
stripper = Stripper()
print stripper.strip("<tag>some boring <a>text</a> goes here</tag>")
"""
import sgmllib
class Stripper(sgmllib.SGMLParser):
    def __init__(self):
        sgmllib.SGMLParser.__init__(self)
    def strip(self, some_html):
        self.theString = ""
        self.feed(some_html)
        self.close()
        return self.theString
    def handle_data(self, data):
        self.theString += data
	# -*- coding: utf-8 -*-

	from __future__ import generators

	__author__ = "Yoshihiro Misawa (remu.biz)"
	__version__ = "0.0.1"
	__copyright__ = "Copyright (c) 2011 Yoshihiro Misawa"
	__license__ = ""

	import urllib2
	from xml.etree.cElementTree import fromstring
	from httplib import HTTPConnection, HTTPException
	from urlparse import urlparse
	from re import match
	from cgi import escape

	class TumbCrawler(object):
	    """Crawl a tumblr blog through its XML read API (``/api/read``).

	    Typical use: construct with a blog name or URL, call genAPIurls() to
	    build the page URLs for a range of posts, then readAPI() to download
	    them and get one dict per post.

	    Relies on the imports urllib2, fromstring, HTTPConnection,
	    HTTPException, urlparse, match, escape and on Stripper.

	    NOTE(review): this copy had lost all of its indentation (and gained a
	    stray backslash in the URL regex); the structure below is restored.
	    """

	    # Number of posts requested per API call.
	    PAGE_SIZE = 50

	    def __init__(self, tid='tid', range=None):
	        """Look up the account named by ``tid`` and prepare the tag stripper.

	        tid   -- blog name (``name`` means ``name.tumblr.com``) or a blog URL.
	        range -- unused; kept so existing callers keep working.

	        Runs chkUid(), so constructing an instance performs network I/O.
	        """
	        self.tid = tid
	        self.user_info = self.chkUid()
	        self.stripper = Stripper()

	    def chkUid(self):
	        """Check whether ``self.tid`` names an existing tumblr account.

	        Returns a dict that always contains 'flag' (1 = found, 0 = not
	        found).  When the account is found it also contains 'uid',
	        'api_domain', 'totalPosts' (string, as sent by the API), 'timezone'
	        and 'tumblr_title'.

	        A single 301 redirect is followed (blogs moved to another domain).
	        HTTP/socket failures, a non-200 answer, or a response without the
	        ``X-Tumblr-User`` header all yield ``{'flag': 0}``.  Errors raised
	        while downloading the account metadata are not caught.
	        """
	        import socket  # function-scope import: only needed for the except clause

	        target = self.tid
	        # Raw string; the alternation is a plain "|" (not an escaped "\|").
	        if match(r'([^"]|^)(https?)(://[\w:;/.?%#&=+-]+)', target):
	            domain = urlparse(target).netloc   # looks like a URL
	        else:
	            domain = target + '.tumblr.com'    # treat it as a blog name

	        info = {'flag': 0}
	        try:
	            status, uid, location = self._probe(domain)
	            if status == 301 and location:
	                # The blog moved to another domain: follow one redirect.
	                domain = urlparse(location).netloc
	                status, uid, location = self._probe(domain)
	        except (HTTPException, socket.error):
	            return info
	        if status != 200 or uid is None:
	            return info

	        info['flag'] = 1
	        info['uid'] = uid
	        info['api_domain'] = domain

	        # One-post request: enough to read the blog-level attributes.
	        elem = self._fetchXml('http://' + domain + '/api/read?num=1')
	        info['totalPosts'] = elem.find('posts').get('total')
	        tumblelog = elem.find('tumblelog')
	        info['timezone'] = tumblelog.get('timezone')
	        info['tumblr_title'] = tumblelog.get('title')
	        return info

	    def _probe(self, domain):
	        """GET ``/`` on ``domain``; return (status, X-Tumblr-User, Location).

	        Missing headers are reported as None.  The connection is always
	        closed, also when the request fails.
	        """
	        conn = HTTPConnection(domain)
	        try:
	            conn.request("GET", '/')
	            resp = conn.getresponse()
	            return (resp.status,
	                    resp.getheader('X-Tumblr-User'),
	                    resp.getheader('Location'))
	        finally:
	            conn.close()

	    def _fetchXml(self, url):
	        """Download ``url`` and return the parsed XML root element."""
	        resp = urllib2.urlopen(url)
	        try:
	            return fromstring(resp.read())
	        finally:
	            resp.close()

	    def genAPIurls(self, start, end):
	        """Build the API URLs covering posts ``start`` .. ``end - 1``.

	        The range is split into pages of at most PAGE_SIZE posts; the last
	        page asks only for the remaining posts.  An empty range yields an
	        empty list.  The list is stored in ``self.api_urls`` (read by
	        readAPI()) and returned.
	        """
	        uid = self.user_info['uid']
	        api_urls = []
	        for offset in range(start, end, self.PAGE_SIZE):
	            count = min(self.PAGE_SIZE, end - offset)
	            api_urls.append('http://%s.tumblr.com/api/read?start=%s&num=%s'
	                            % (uid, offset, count))
	        self.api_urls = api_urls
	        return api_urls

	    def readAPI(self):
	        """Download every URL in ``self.api_urls`` and return the post dicts.

	        genAPIurls() must have been called first.
	        """
	        posts = []
	        for url in self.api_urls:
	            posts.extend(self._parsePosts(self._fetchXml(url)))
	        return posts

	    def _parsePosts(self, elem):
	        """Convert the ``posts/post`` children of ``elem`` into post dicts.

	        Posts of an unknown type are reported on stdout and skipped.
	        """
	        handlers = {'regular': self.text,
	                    'photo': self.photo,
	                    'quote': self.quote,
	                    'link': self.link,
	                    'conversation': self.chat,
	                    'audio': self.audio,
	                    'video': self.video}
	        posts = []
	        for node in elem.findall('posts/post'):
	            ttype = node.get('type')
	            handler = handlers.get(ttype)
	            if handler is None:
	                print('Sorry. This type is unsupported.')
	                continue
	            posts.append(handler(node, ttype))
	        return posts

	    def _commonFields(self, data, ttype):
	        """Return the dict entries shared by every post type.

	        Paths carry no trailing '/': ElementTree 1.3+ reads ``.//tag/`` as
	        ``.//tag/*`` (children of <tag>), which would never match.
	        """
	        return {"userID": self.user_info['uid'],
	                "pID": data.get('id'),
	                "ttype": ttype,
	                "tags": [tag.text for tag in data.findall('.//tag') if tag.text],
	                "UnixTimestamp": data.get('unix-timestamp'),
	                "reblogKey": data.get('reblog-key')}

	    def _escapeText(self, value):
	        """HTML-escape ``value``; a missing (None) field stays None."""
	        if value is None:
	            return None
	        return escape(value)

	    def _stripTags(self, value):
	        """Remove markup from ``value``; a missing (None) field stays None."""
	        if value is None:
	            return None
	        return self.stripper.strip(value)

	    def text(self, data, ttype):
	        """Build the dict for a 'regular' (text) post element."""
	        post = self._commonFields(data, ttype)
	        post["regularTitle"] = self._stripTags(data.findtext('.//regular-title'))
	        post["regularBody"] = self._escapeText(data.findtext('.//regular-body'))
	        return post

	    def photo(self, data, ttype):
	        """Build the dict for a 'photo' post element.

	        The image fields are derived from the ``photo-url`` child whose
	        ``max-width`` is '75'; they stay None when there is no such child.
	        """
	        post = self._commonFields(data, ttype)
	        photoChar = photoType = photo_url = None
	        for node in data.findall('.//photo-url'):
	            src = node.text
	            if node.get('max-width') != '75' or not src:
	                continue
	            num = src.find('tumblr.com/') + 11   # 11 == len('tumblr.com/')
	            photoType = src[-3:]                 # last three characters, e.g. 'jpg'
	            # [num:-9] drops the host prefix and the last 9 characters
	            # (presumably a '_75sq.xxx' suffix -- confirm against real URLs).
	            if len(src[num:-9]) < 24:
	                photoChar = None
	                photo_url = src.replace("_75sq", "_500")
	            else:
	                photoChar = src[num:num + 24]
	                photo_url = ('http://%s.tumblr.com/photo/1280/%s/1/%s'
	                             % (post["userID"], post["pID"], photoChar))
	        post["photoLink"] = self._stripTags(data.findtext('.//photo-link-url'))
	        post["photoCaption"] = self._escapeText(data.findtext('.//photo-caption'))
	        post["photoChar"] = photoChar
	        post["photoType"] = photoType
	        post["photo_url"] = photo_url
	        return post

	    def quote(self, data, ttype):
	        """Build the dict for a 'quote' post element."""
	        post = self._commonFields(data, ttype)
	        post["quoteText"] = self._escapeText(data.findtext('.//quote-text'))
	        post["quoteSource"] = self._escapeText(data.findtext('.//quote-source'))
	        return post

	    def link(self, data, ttype):
	        """Build the dict for a 'link' post element."""
	        post = self._commonFields(data, ttype)
	        post["linkTitle"] = self._stripTags(data.findtext('.//link-text'))
	        post["linkUrl"] = self._stripTags(data.findtext('.//link-url'))
	        post["linkDescription"] = self._escapeText(
	            data.findtext('.//link-description'))
	        return post

	    def chat(self, data, ttype):
	        """Build the dict for a 'conversation' (chat) post element.

	        'conversation' is a list of one-entry dicts ``{name: text}``, one
	        per child of each <conversation> element, in document order.
	        """
	        post = self._commonFields(data, ttype)
	        conversation = []
	        for block in data.findall('.//conversation'):
	            for line in block:   # direct iteration replaces getchildren()
	                conversation.append({line.get('name'): line.text})
	        post["conversationTitle"] = self._stripTags(
	            data.findtext('.//conversation-title'))
	        post["conversation"] = conversation
	        return post

	    def audio(self, data, ttype):
	        """Build the dict for an 'audio' post element."""
	        post = self._commonFields(data, ttype)
	        post["audioCaption"] = self._escapeText(data.findtext('.//audio-caption'))
	        post["audioPlayer"] = self._escapeText(data.findtext('.//audio-player'))
	        return post

	    def video(self, data, ttype):
	        """Build the dict for a 'video' post element."""
	        post = self._commonFields(data, ttype)
	        post["videoCaption"] = self._escapeText(data.findtext('.//video-caption'))
	        post["videoPlayer"] = self._escapeText(data.findtext('.//video-player'))
	        return post

	"""
	htmlタグの削除
	stripper = Stripper()
	print stripper.strip("<tag>some boring <a>text</a> goes here</tag>")
	"""
	import sgmllib
	class Stripper(sgmllib.SGMLParser):
	def __init__(self):
	sgmllib.SGMLParser.__init__(self)
	def strip(self, some_html):
	self.theString = ""
	self.feed(some_html)
	self.close()
	return self.theString
	def handle_data(self, data):
	self.theString += data