Created
May 24, 2011 08:03
-
-
Save PyYoshi/988299 to your computer and use it in GitHub Desktop.
アレです
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
from __future__ import generators

__author__ = "Yoshihiro Misawa (remu.biz)"
__version__ = "0.0.1"
__copyright__ = "Copyright (c) 2011 Yoshihiro Misawa"
__license__ = ""

import urllib2
from xml.etree.cElementTree import fromstring
from httplib import HTTPConnection, HTTPException
from urlparse import urlparse
from re import match
from cgi import escape
class TumbCrawler():
    """Crawler for the Tumblr v1 read API (http://<uid>.tumblr.com/api/read).

    Typical usage:
        crawler = TumbCrawler('someuser')   # probes the account over HTTP
        crawler.genAPIurls(0, 100)
        posts = crawler.readAPI()
    """
    def __init__(self, tid='tid', range=None):
        # NOTE: `range` shadows the builtin and is never used; the parameter
        # name is kept only for backward compatibility with existing callers.
        self.tid = tid
        self.user_info = self.chkUid()
        self.stripper = Stripper()
    def chkUid(self):
        """Check whether `self.tid` names an existing Tumblr account.

        `tid` may be a full URL or a bare account id (probed as
        '<tid>.tumblr.com').  Returns a dict with:
          flag        -- 1 if the account exists, 0 otherwise
          uid         -- value of the X-Tumblr-User response header
          api_domain  -- domain for API calls (follows one 301 redirect)
          totalPosts / timezone / tumblr_title -- only present when flag == 1
        """
        arg = self.tid
        uid = None
        flag = 0
        api_domain = None
        info = {}
        conn = None
        try:
            # Accept either a URL or a bare user id.
            if match('([^"]|^)(https?)(://[\w:;/.?%#&=+-]+)', arg):
                domain = urlparse(arg).netloc
            else:
                domain = arg + '.tumblr.com'
            conn = HTTPConnection(domain)
            conn.request("GET", '/')
            r = conn.getresponse()
            if r.status == 200:
                flag = 1  # account exists
                uid = r.msg['X-Tumblr-User']
                api_domain = domain
                info['uid'] = uid
                info['api_domain'] = api_domain
            elif r.status == 301:
                # Account moved to a custom domain: follow the redirect once.
                domain2 = urlparse(r.msg['Location']).netloc
                conn2 = HTTPConnection(domain2)
                try:
                    conn2.request("GET", '/')
                    r2 = conn2.getresponse()
                    if r2.status == 200:
                        flag = 1
                        uid = r2.msg['X-Tumblr-User']
                        api_domain = domain2
                        info['uid'] = uid
                        info['api_domain'] = api_domain
                finally:
                    conn2.close()  # was leaked in the original
            # Any other status: account missing or bad request; flag stays 0.
        except HTTPException:
            flag = 0
        finally:
            if conn is not None:
                conn.close()
        info['flag'] = flag
        if flag == 1:
            # Fetch post count, timezone and blog title via a 1-post read.
            api_url = 'http://' + api_domain + '/api/read?num=1'
            data = urllib2.urlopen(api_url).read()
            elem = fromstring(data)
            info['totalPosts'] = elem.find('posts').get('total')
            tumblelog = elem.find('tumblelog')
            info['timezone'] = tumblelog.get('timezone')
            info['tumblr_title'] = tumblelog.get('title')
        return info
    def genAPIurls(self, start, end):
        """Build API read URLs covering posts in [start, end).

        Pages are requested 50 posts at a time; the last URL requests the
        remainder ((end - start) % 50).  Stores the list on self.api_urls
        and returns it.
        """
        uid = self.user_info['uid']
        num = 50
        modPosts = (end - start) % num
        # '//' keeps Py2's integer-division behavior and works on Py3 too.
        repNum = (end - start - modPosts) // num
        api_urls = []
        for i in range(repNum + 1):
            if i == 0:
                # First page at the caller-supplied offset.
                url = 'http://%(uid)s.tumblr.com/api/read?start=%(start)s&num=%(num)s' % locals()
            elif i == repNum:
                # Last page: only the remainder.
                start = end - modPosts
                url = 'http://%(uid)s.tumblr.com/api/read?start=%(start)s&num=%(modPosts)s' % locals()
            else:
                start += num
                url = 'http://%(uid)s.tumblr.com/api/read?start=%(start)s&num=%(num)s' % locals()
            api_urls.append(url)
        self.api_urls = api_urls
        return api_urls
    def readAPI(self):
        """Fetch every URL in self.api_urls and format each post by type.

        Returns a list of post dicts.  Unsupported types are skipped with
        a message.
        """
        # Type -> formatter dispatch.  BUG FIX: 'conversation' posts were
        # dispatched to self.conversation, which does not exist; the
        # formatter is named `chat`.
        handlers = {'regular': self.text,
                    'photo': self.photo,
                    'quote': self.quote,
                    'link': self.link,
                    'conversation': self.chat,
                    'audio': self.audio,
                    'video': self.video}
        posts = []
        for api_url in self.api_urls:
            data = urllib2.urlopen(api_url).read()
            elem = fromstring(data)
            for node in elem.findall('posts/post'):
                ttype = node.get('type')
                handler = handlers.get(ttype)
                if handler is not None:
                    posts.append(handler(node, ttype))
                else:
                    print('Sorry. This type is unsupported.')
        return posts
    def _base_post(self, data, ttype):
        """Return the fields shared by every post type."""
        tags = []
        for tag in data.findall('.//tag/'):
            if tag.text:
                tags.append(tag.text)
        return {"userID": self.user_info['uid'],
                "pID": data.get('id'),
                "ttype": ttype,
                "tags": tags,
                "UnixTimestamp": data.get('unix-timestamp'),
                "reblogKey": data.get('reblog-key')}
    def text(self, data, ttype):
        """Format a 'regular' (text) post."""
        post = self._base_post(data, ttype)
        # findtext defaults to '' so strip()/escape() never see None.
        post["regularTitle"] = self.stripper.strip(data.findtext('.//regular-title/', ''))
        post["regularBody"] = escape(data.findtext('.//regular-body/', ''))
        return post
    def photo(self, data, ttype):
        """Format a 'photo' post, deriving a large photo URL when possible."""
        post = self._base_post(data, ttype)
        userID = post["userID"]
        pID = post["pID"]
        # Initialized up front: the original raised NameError when no
        # 75px photo-url variant was present.
        photoChar = None
        photoType = None
        photo_url = None
        for i in data.findall('.//photo-url'):
            if i.get('max-width') != '75':
                continue
            num = i.text.find('tumblr.com/') + 11
            photoType = i.text[-3:]  # jpg, gif, png, bmp
            if len(i.text[num:-9]) < 24:
                # Short identifier: fall back to the 500px thumbnail variant.
                photoChar = None
                photo_url = i.text.replace("_75sq", "_500")
            else:
                photoChar = i.text[num:num + 24]  # tumblr_xxxxxxxxxxxx id
                photo_url = 'http://%(userID)s.tumblr.com/photo/1280/%(pID)s/1/%(photoChar)s' % locals()
        post["photoLink"] = self.stripper.strip(data.findtext('.//photo-link-url/', ''))
        post["photoCaption"] = escape(data.findtext('.//photo-caption/', ''))
        post["photoChar"] = photoChar
        post["photoType"] = photoType
        post["photo_url"] = photo_url
        return post
    def quote(self, data, ttype):
        """Format a 'quote' post."""
        post = self._base_post(data, ttype)
        post["quoteText"] = escape(data.findtext('.//quote-text/', ''))
        post["quoteSource"] = escape(data.findtext('.//quote-source/', ''))
        return post
    def link(self, data, ttype):
        """Format a 'link' post."""
        post = self._base_post(data, ttype)
        post["linkTitle"] = self.stripper.strip(data.findtext('.//link-text/', ''))
        post["linkUrl"] = self.stripper.strip(data.findtext('.//link-url/', ''))
        post["linkDescription"] = escape(data.findtext('.//link-description/', ''))
        return post
    def chat(self, data, ttype):
        """Format a 'conversation' (chat) post."""
        post = self._base_post(data, ttype)
        conversation = []
        for conv in data.findall('.//conversation'):
            # Iterating an Element yields its children (getchildren() is
            # deprecated).
            for line in conv:
                conversation.append({line.get('name'): line.text})
        post["conversationTitle"] = self.stripper.strip(data.findtext('.//conversation-title/', ''))
        post["conversation"] = conversation
        return post
    def audio(self, data, ttype):
        """Format an 'audio' post."""
        post = self._base_post(data, ttype)
        post["audioCaption"] = escape(data.findtext('.//audio-caption/', ''))
        post["audioPlayer"] = escape(data.findtext('.//audio-player/', ''))
        return post
    def video(self, data, ttype):
        """Format a 'video' post."""
        post = self._base_post(data, ttype)
        post["videoCaption"] = escape(data.findtext('.//video-caption/', ''))
        post["videoPlayer"] = escape(data.findtext('.//video-player/', ''))
        return post
""" | |
htmlタグの削除 | |
stripper = Stripper() | |
print stripper.strip("<tag>some boring <a>text</a> goes here</tag>") | |
""" | |
import sgmllib | |
class Stripper(sgmllib.SGMLParser):
    """Strip SGML/HTML markup from a string, keeping only the text content.

    Usage:
        stripper = Stripper()
        stripper.strip("<tag>some <a>text</a></tag>")  # -> 'some text'
    """
    def __init__(self):
        sgmllib.SGMLParser.__init__(self)
    def strip(self, some_html):
        """Return *some_html* with every tag removed."""
        self._pieces = []
        self.feed(some_html)
        self.close()
        return "".join(self._pieces)
    def handle_data(self, data):
        # Parser callback: collect the character data between tags.
        self._pieces.append(data)
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.