Skip to content

Instantly share code, notes, and snippets.

@HuangFJ
Created February 8, 2013 07:11
Show Gist options
  • Save HuangFJ/4737224 to your computer and use it in GitHub Desktop.
Save HuangFJ/4737224 to your computer and use it in GitHub Desktop.
采集微博的动画gif图片
from distutils.core import setup
import py2exe
# py2exe build script for test.py.

# Extra files handed to setup() alongside the script.
data_files = ["config.ini", "test.db"]
# Modules listed explicitly for inclusion in the build.
includes = ["mechanize", "simplejson", "PIL.Image"]
options = {
    "py2exe": {
        "compressed": 1,
        "optimize": 2,
        "includes": includes,
        "bundle_files": 1,
    },
}
setup(
    version="0.1.0",
    options=options,
    data_files=data_files,
    zipfile=None,
    console=[{"script": "test.py"}],
)
# encoding = utf-8
import mechanize
import os, fnmatch
from cStringIO import StringIO
import sqlite3
import cookielib
import simplejson
import re
import ConfigParser
from threading import Thread
from Queue import Queue
import urllib2
import socket
from PIL import Image
import traceback
import time
# Default timeout, in seconds, installed for sockets created from here on.
timeout = 300
socket.setdefaulttimeout(timeout)
def locate(pattern, root=os.curdir):
    """Yield the path of every file below *root* whose name matches *pattern*.

    *root* is made absolute first and walked recursively with ``os.walk``;
    *pattern* is a shell-style wildcard applied to bare file names through
    ``fnmatch.filter``.
    """
    top = os.path.abspath(root)
    for folder, _subdirs, names in os.walk(top):
        matching = fnmatch.filter(names, pattern)
        for name in matching:
            yield os.path.join(folder, name)
def read_cookie(browser, site):
    """Load a browser's stored cookies for the given hosts into a cookie jar.

    The cookie database is looked up under the Windows per-user directories
    named by the LOCALAPPDATA (Chrome) or APPDATA (Firefox) environment
    variables.

    Args:
        browser: ``'chrome'`` or ``'firefox'``; selects which cookie database
            is read.
        site: comma-separated host suffixes. A cookie is selected when its
            host matches ``%<suffix>`` under SQL ``LIKE`` for any of them.

    Returns:
        A ``cookielib.MozillaCookieJar`` holding the selected cookies.

    Raises:
        Exception: no cookie database was found for *browser*.
    """
    hosts = site.split(',')
    cookie_path = None
    if 'LOCALAPPDATA' not in os.environ:
        # Derive the directory from USERPROFILE when the variable is unset.
        os.environ['LOCALAPPDATA'] = os.path.join(
            os.environ['USERPROFILE'], 'Local Settings', 'Application Data')
    if browser == 'chrome':
        # e.g. C:\Users\Jon\AppData\Local\Google\Chrome\User Data\Default\Cookies
        chrome_cookie = os.path.join(
            os.environ['LOCALAPPDATA'],
            'Google', 'Chrome', 'User Data', 'Default', 'Cookies')
        if os.path.isfile(chrome_cookie):
            cookie_path = chrome_cookie
    elif browser == 'firefox':
        # e.g. C:\Users\Jon\AppData\Roaming\Mozilla\Firefox\Profiles\5t3akq0c.default\cookies.sqlite
        profiles = os.path.join(
            os.environ['APPDATA'], 'Mozilla', 'Firefox', 'Profiles')
        # Only the first hit is used, so stop walking as soon as it is found.
        cookie_path = next(locate('cookies.sqlite', profiles), None)
    if cookie_path is None:
        raise Exception(browser.title() + ' is not installed.')
    try:
        cookie_path.decode('utf8')
    except UnicodeError:
        # Not valid UTF-8: treat the path as GBK and re-encode it as UTF-8.
        cookie_path = cookie_path.decode('gbk').encode('utf8')

    if browser == 'firefox':
        query = ('select host, path, isSecure, expiry, name, value '
                 'from moz_cookies where ')
        host_column = 'host'
    else:
        query = ('select host_key, path, secure, expires_utc, name, value '
                 'from cookies where ')
        host_column = 'host_key'
    # One bound "LIKE ?" term per host, so quotes in the configured host names
    # cannot break or alter the statement.
    query += ' or '.join([host_column + ' like ?'] * len(hosts))
    patterns = ['%' + host for host in hosts]

    con = sqlite3.connect(cookie_path)
    try:
        con.text_factory = str
        rows = con.execute(query, patterns).fetchall()
    finally:
        # Release the database handle even when the query fails.
        con.close()

    # NOTE(review): expiry values are copied verbatim from either database;
    # confirm both use the unit the cookie-file format expects.
    ftstr = ["FALSE", "TRUE"]
    s = StringIO()
    s.write('# Netscape HTTP Cookie File\n'
            '# http://www.netscape.com/newsref/std/cookie_spec.html\n'
            '# This is a generated file! Do not edit.\n')
    for row in rows:
        try:
            s.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (
                row[0], ftstr[row[0].startswith('.')], row[1],
                ftstr[row[2]], row[3], row[4], row[5]))
        except UnicodeError:
            # Skip cookies whose fields cannot be formatted as text.
            continue
    s.seek(0)
    cookie_jar = cookielib.MozillaCookieJar()
    # Parse the in-memory cookie file directly instead of going through a
    # file on disk; discardable and expired cookies are kept.
    cookie_jar._really_load(s, '', True, True)
    return cookie_jar
def download_anigif(src):
    """Download *src* into ``images/`` when it is a multi-frame image.

    Nothing is done if a file with the URL's base name already exists in
    ``images/``. Otherwise the data is fetched and opened with PIL; it is
    written to disk, and *src* is printed, only if a second frame can be
    reached.

    Args:
        src: URL of the image to fetch.
    """
    path = os.path.join('images', os.path.basename(src))
    if os.path.isfile(path):
        return
    response = urllib2.urlopen(src)
    try:
        data = response.read()
    finally:
        # Always release the HTTP connection.
        response.close()
    string_io = StringIO(data)
    try:
        image = Image.open(string_io)
        try:
            # Seeking to the second frame raises EOFError on single-frame
            # images, which are not kept.
            image.seek(1)
        except EOFError:
            return
    finally:
        # Close the buffer on every path, including the early return.
        string_io.close()
    with open(path, 'wb') as fp:
        fp.write(data)
    print(src)
class Worker(Thread):
    """Daemon thread that executes ``(func, args, kargs)`` jobs from a queue."""

    def __init__(self, tasks):
        """Keep a reference to the job queue *tasks* and start immediately."""
        Thread.__init__(self)
        self.daemon = True
        self.tasks = tasks
        self.start()

    def run(self):
        """Take jobs off the queue forever, calling each one in turn.

        When a job raises, its traceback is printed; the job is marked done
        either way and the loop carries on with the next one.
        """
        queue = self.tasks
        while True:
            job = queue.get()
            func, args, kargs = job
            try:
                func(*args, **kargs)
            except:
                traceback.print_exc()
            queue.task_done()
class ThreadPool:
    """Group of Worker threads consuming jobs from one shared queue."""

    def __init__(self, num_threads):
        """Create a queue bounded to *num_threads* and start that many workers."""
        queue = Queue(num_threads)
        self.tasks = queue
        for _index in range(num_threads):
            Worker(queue)

    def add_task(self, func, *args, **kargs):
        """Queue a call of ``func(*args, **kargs)`` for a worker to run."""
        job = (func, args, kargs)
        self.tasks.put(job)

    def wait_completion(self):
        """Block until every queued job has been marked done."""
        self.tasks.join()
# Script entry point: request the post listing of each configured uid page by
# page and queue every not-yet-recorded sinaimg.cn .gif URL for download.
# NOTE(review): this listing carries no indentation, so the nesting of the
# statements below (in particular the `else:` before the regex search and the
# final commit / wait_completion calls) must be confirmed against the
# original file.
if __name__ == '__main__':
# Create the output directory; any error raised here is silently ignored.
try: os.mkdir('images')
except: pass
# Settings come from the [weibo] section of config.ini: browser, host, uid.
config = ConfigParser.ConfigParser()
config.read('config.ini')
# Cookies read from the configured browser are attached to the mechanize
# browser used for every listing request.
cookiejar = read_cookie(config.get('weibo', 'browser'), config.get('weibo', 'host'))
br = mechanize.Browser()
br.set_cookiejar(cookiejar)
# Pool of 20 worker threads that perform the downloads.
pool = ThreadPool(20)
# test.db holds the image URLs already queued (table `images`, column `src`).
con = sqlite3.connect('test.db')
cur = con.cursor()
# `uid` is a comma-separated list; uid_idx selects the entry being crawled.
uid_str = config.get('weibo', 'uid')
uid_list = uid_str.split(',')
uid_idx = 0
uid_count = len(uid_list)
page = 1
while True:
uid = uid_list[uid_idx]
try:
result = br.open('http://www.weibo.com/aj/mblog/mbloglist?count=15&page=%s&uid=%s' % (page, uid), timeout=60).read()
except socket.timeout:
# On a timeout, wait ten seconds and request the same page again.
time.sleep(10)
continue
page += 1
try:
result = simplejson.loads(result)
except:
# Any failure to parse the response as JSON is reported as not signed in.
raise Exception('You do not sign in.')
if result['code'] == '100000':
if result['data'].find('mid=') == -1:
# No 'mid=' in the returned data: restart at page 1 with the next uid,
# wrapping around to the first uid after the last one.
page = 1
if (uid_idx + 1) < uid_count:
uid_idx = uid_idx + 1
else:
uid_idx = 0
else:
# Collect the sinaimg.cn .gif URLs found in the returned data.
srcs = re.findall(r'http://[^"]+sinaimg\.cn[^"]+\.gif', result['data'])
for src in srcs:
# Rewrite 'thumbnail' to 'large' in the URL before looking it up.
src = src.replace('thumbnail', 'large')
cur.execute('SELECT src FROM images WHERE src=?', (src,))
if cur.fetchone() is None:
# URL not recorded yet: queue the download and record it.
pool.add_task(download_anigif, src)
cur.execute('INSERT INTO images (src) VALUES (?)', (src,))
con.commit()
pool.wait_completion()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment