Skip to content

Instantly share code, notes, and snippets.

@PyYoshi
Created May 24, 2011 08:03
Show Gist options
  • Save PyYoshi/988299 to your computer and use it in GitHub Desktop.
Save PyYoshi/988299 to your computer and use it in GitHub Desktop.
アレです
# -*- coding: utf-8 -*-
from __future__ import generators
__author__ = "Yoshihiro Misawa (remu.biz)"
__version__ = "0.0.1"
__copyright__ = "Copyright (c) 2011 Yoshihiro Misawa"
__license__ = ""
import urllib2
from xml.etree.cElementTree import fromstring
from httplib import HTTPConnection, HTTPException
from urlparse import urlparse
from re import match
from cgi import escape
class TumbCrawler(object):
    """Crawl a Tumblr blog through the legacy XML read API (``/api/read``).

    Typical use: construct with a blog id or URL (this probes the blog over
    HTTP), call ``genAPIurls`` to build the paged request URLs, then
    ``readAPI`` to download them and turn every post into a plain dict.
    """

    def __init__(self, tid='tid', range=None):
        """Probe the blog identified by *tid* and prepare the HTML stripper.

        Args:
            tid: Tumblr user id (``name`` of ``name.tumblr.com``) or a full
                http(s) URL of the blog.
            range: unused; kept only so existing callers keep working.
        """
        self.tid = tid
        self.user_info = self.chkUid()
        self.stripper = Stripper()
        # Filled in by genAPIurls(); starting empty lets readAPI() be called
        # first without raising AttributeError.
        self.api_urls = []

    # ------------------------------------------------------------------
    # Small private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _probe(domain):
        """GET ``/`` on *domain*; return ``(status, X-Tumblr-User, Location)``.

        Missing headers are reported as None. The connection is always
        closed, even when the request raises.
        """
        conn = HTTPConnection(domain)
        try:
            conn.request("GET", '/')
            resp = conn.getresponse()
            return (resp.status,
                    resp.getheader('X-Tumblr-User'),
                    resp.getheader('Location'))
        finally:
            conn.close()

    @staticmethod
    def _fetch(url):
        """Download *url* and return the raw body, always closing the handle."""
        response = urllib2.urlopen(url)
        try:
            return response.read()
        finally:
            response.close()

    @staticmethod
    def _escape(text):
        """HTML-escape *text*; a missing (None) optional field stays None."""
        if text is None:
            return None
        return escape(text)

    def _strip(self, html):
        """Remove markup from *html*; a missing (None) field stays None."""
        if html is None:
            return None
        return self.stripper.strip(html)

    def _commonFields(self, data, ttype):
        """Return the dict entries that every post type shares."""
        # NOTE: the XPath must not end in '/': since ElementTree 1.3 a
        # trailing slash means '/*' (children of the match), which made the
        # tag list always empty.
        return {"userID": self.user_info['uid'],
                "pID": data.get('id'),  # post id
                "ttype": ttype,
                "tags": [tag.text for tag in data.findall('.//tag')
                         if tag.text],
                "UnixTimestamp": data.get('unix-timestamp'),
                "reblogKey": data.get('reblog-key')}

    # ------------------------------------------------------------------
    # Account lookup
    # ------------------------------------------------------------------
    def chkUid(self):
        """Check whether ``self.tid`` names an existing Tumblr account.

        ``self.tid`` is treated as a URL when it looks like one, otherwise as
        a user id under ``tumblr.com``. A single 301 redirect is followed
        (blogs that moved to another domain).

        Returns:
            dict: always contains ``'flag'`` (1 = account found, 0 = not
            found or the probe failed). When found it also contains
            ``'uid'``, ``'api_domain'``, ``'totalPosts'`` (string as sent by
            the API), ``'timezone'`` and ``'tumblr_title'``; the last three
            are None when the API response lacks the element.

        Raises:
            urllib2.URLError: if the follow-up ``/api/read`` request fails.
        """
        arg = self.tid
        if match(r'([^"]|^)(https?)(://[\w:;/.?%#&=+-]+)', arg):  # a URL
            domain = urlparse(arg).netloc
        else:  # a bare user id (or anything else)
            domain = arg + '.tumblr.com'

        info = {'flag': 0}
        if not domain:
            return info

        try:
            status, uid, location = self._probe(domain)
            if status == 301 and location:  # blog moved to another domain
                domain = urlparse(location).netloc
                if not domain:
                    return info
                status, uid, location = self._probe(domain)
        except (HTTPException, IOError):
            # IOError also covers socket errors (DNS failure, refused
            # connection); treat them like "account not found".
            return info

        # A 200 answer without the X-Tumblr-User header is not a Tumblr blog.
        if status != 200 or uid is None:
            return info

        info['flag'] = 1
        info['uid'] = uid
        info['api_domain'] = domain

        # One-post read just to obtain the blog metadata.
        elem = fromstring(self._fetch('http://' + domain + '/api/read?num=1'))
        posts = elem.find('posts')
        tumblelog = elem.find('tumblelog')
        info['totalPosts'] = None if posts is None else posts.get('total')
        info['timezone'] = (None if tumblelog is None
                            else tumblelog.get('timezone'))
        info['tumblr_title'] = (None if tumblelog is None
                                else tumblelog.get('title'))
        return info

    # ------------------------------------------------------------------
    # Paging
    # ------------------------------------------------------------------
    def genAPIurls(self, start, end, num=50):
        """Build the ``/api/read`` URLs covering posts ``start`` .. ``end-1``.

        Args:
            start: offset of the first post to request.
            end: offset one past the last post to request.
            num: page size per request (defaults to the previously
                hard-coded 50).

        Returns:
            list of str: one URL per page; the last page only asks for the
            remaining posts. Empty when ``end <= start``. The list is also
            stored on ``self.api_urls`` for ``readAPI``.

        Raises:
            ValueError: if *num* is smaller than 1.
        """
        if num < 1:
            raise ValueError('num must be >= 1')
        uid = self.user_info['uid']
        api_urls = []
        for offset in range(start, end, num):
            count = min(num, end - offset)  # final page may be shorter
            api_urls.append('http://%s.tumblr.com/api/read?start=%s&num=%s'
                            % (uid, offset, count))
        self.api_urls = api_urls
        return api_urls

    # ------------------------------------------------------------------
    # Reading and dispatch
    # ------------------------------------------------------------------
    def readAPI(self):
        """Download every URL in ``self.api_urls`` and convert the posts.

        Each ``<post>`` is routed by its ``type`` attribute to the matching
        builder method. Unsupported types print a notice and are skipped.

        Returns:
            list of dict: one entry per supported post, in download order.
        """
        handlers = {'regular': self.text,
                    'photo': self.photo,
                    'quote': self.quote,
                    'link': self.link,
                    'conversation': self.chat,  # API name differs from ours
                    'audio': self.audio,
                    'video': self.video}
        posts = []
        for api_url in self.api_urls:
            elem = fromstring(self._fetch(api_url))
            for node in elem.findall('posts/post'):
                ttype = node.get('type')
                handler = handlers.get(ttype)
                if handler is None:
                    print('Sorry. This type is unsupported.')
                    continue
                posts.append(handler(node, ttype))
        return posts

    # ------------------------------------------------------------------
    # Per-type post builders. Each takes the <post> element and its type
    # string and returns a dict; optional elements that are absent yield
    # None instead of raising.
    # ------------------------------------------------------------------
    def text(self, data, ttype):
        """Build the dict for a ``regular`` (text) post."""
        post = self._commonFields(data, ttype)
        post["regularTitle"] = self._strip(
            data.findtext('.//regular-title'))  # optional
        post["regularBody"] = self._escape(data.findtext('.//regular-body'))
        return post

    def photo(self, data, ttype):
        """Build the dict for a ``photo`` post.

        The image fields are derived from the 75px thumbnail URL. When the
        file name is long enough, a ``/photo/1280/`` URL is built from its
        first 24 characters; otherwise the thumbnail URL is rewritten to the
        500px variant. Without a 75px URL the three image fields are None.
        When several 75px URLs exist the last one wins.
        """
        userID = self.user_info['uid']
        pID = data.get('id')
        photoChar = photoType = photo_url = None
        marker = 'tumblr.com/'
        for node in data.findall('.//photo-url'):
            thumb = node.text
            if node.get('max-width') != '75' or not thumb:
                continue
            photoType = thumb[-3:]  # jpg, gif, png, bmp
            pos = thumb.find(marker)
            # File name without the trailing "_75sq.ext" (9 characters).
            stem = thumb[pos + len(marker):-9] if pos != -1 else ''
            if len(stem) < 24:
                photoChar = None
                photo_url = thumb.replace("_75sq", "_500")
            else:
                photoChar = stem[:24]  # e.g. tumblr_xxxxxxxxxxxxxxxxx
                photo_url = ('http://%s.tumblr.com/photo/1280/%s/1/%s'
                             % (userID, pID, photoChar))
        post = self._commonFields(data, ttype)
        post["photoLink"] = self._strip(
            data.findtext('.//photo-link-url'))  # optional
        post["photoCaption"] = self._escape(
            data.findtext('.//photo-caption'))  # optional
        post["photoChar"] = photoChar
        post["photoType"] = photoType
        post["photo_url"] = photo_url
        return post

    def quote(self, data, ttype):
        """Build the dict for a ``quote`` post."""
        post = self._commonFields(data, ttype)
        post["quoteText"] = self._escape(data.findtext('.//quote-text'))
        post["quoteSource"] = self._escape(
            data.findtext('.//quote-source'))  # optional
        return post

    def link(self, data, ttype):
        """Build the dict for a ``link`` post."""
        post = self._commonFields(data, ttype)
        post["linkTitle"] = self._strip(
            data.findtext('.//link-text'))  # optional
        post["linkUrl"] = self._strip(data.findtext('.//link-url'))
        post["linkDescription"] = self._escape(
            data.findtext('.//link-description'))  # optional
        return post

    def chat(self, data, ttype):
        """Build the dict for a ``conversation`` (chat) post.

        ``conversation`` is a list of single-entry dicts, one per child of
        each ``<conversation>`` element, mapping its ``name`` attribute to
        its text.
        """
        conversation = []
        for block in data.findall('.//conversation'):
            for line in block:  # iterate children (getchildren is deprecated)
                conversation.append({line.get('name'): line.text})
        post = self._commonFields(data, ttype)
        post["conversationTitle"] = self._strip(
            data.findtext('.//conversation-title'))  # optional
        post["conversation"] = conversation
        return post

    def audio(self, data, ttype):
        """Build the dict for an ``audio`` post."""
        post = self._commonFields(data, ttype)
        post["audioCaption"] = self._escape(
            data.findtext('.//audio-caption'))  # optional
        post["audioPlayer"] = self._escape(data.findtext('.//audio-player'))
        return post

    def video(self, data, ttype):
        """Build the dict for a ``video`` post."""
        post = self._commonFields(data, ttype)
        post["videoCaption"] = self._escape(data.findtext('.//video-caption'))
        post["videoPlayer"] = self._escape(data.findtext('.//video-player'))
        return post
"""
Strip HTML tags from a string. Usage:
stripper = Stripper()
print stripper.strip("<tag>some boring <a>text</a> goes here</tag>")
"""
import sgmllib
class Stripper(sgmllib.SGMLParser):
    """Remove markup from an HTML fragment, keeping only its text.

    Example:
        stripper = Stripper()
        stripper.strip("<tag>some boring <a>text</a> goes here</tag>")
        # -> "some boring text goes here"
    """

    def __init__(self):
        sgmllib.SGMLParser.__init__(self)
        # Result of the most recent strip(); defined up front so reading it
        # before the first strip() does not raise AttributeError.
        self.theString = ""
        self._chunks = []

    def strip(self, some_html):
        """Return *some_html* with all tags removed.

        Args:
            some_html: markup to clean; None or an empty string yields "".

        Returns:
            The concatenated text content, also stored on ``self.theString``.
        """
        # Start from a clean parser so state left over from a previous
        # (possibly malformed) document cannot leak into this one.
        self.reset()
        self._chunks = []
        if some_html:
            self.feed(some_html)
            self.close()
        self.theString = "".join(self._chunks)
        return self.theString

    def handle_data(self, data):
        """Parser callback: collect each run of text between tags."""
        self._chunks.append(data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment