Last active
December 24, 2015 18:29
-
-
Save wenLiangcan/6843820 to your computer and use it in GitHub Desktop.
Modified from http://paste.pound-python.org/show/28965/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# Filename: get_tieba_pic.py | |
from __future__ import print_function | |
import os | |
import re | |
import sys | |
if sys.version_info[0] == 2: | |
import urllib2 | |
input = raw_input | |
else: | |
import urllib.request as urllib2 | |
PAGE_LINK = 'http://tieba.baidu.com/p/{post_id}?see_lz=1&pn={page_num}' | |
def create_folder(post_id):
    """Ensure the download directory for a post exists and return its name.

    The directory is named ``tieba_<post_id>`` and is created relative to
    the current working directory.

    Args:
        post_id: Identifier of the post; it is formatted into the name.

    Returns:
        The directory name (a relative path).

    Raises:
        OSError: If the directory cannot be created, including the case
            where a non-directory entry with the same name already exists.
    """
    dir_name = 'tieba_{}'.format(post_id)
    try:
        os.mkdir(dir_name)
    except OSError:
        # Creating first and inspecting afterwards avoids the race between
        # an existence check and mkdir. An existing directory is fine;
        # anything else (a plain file in the way, permissions, ...) is a
        # real error the caller must see.
        if not os.path.isdir(dir_name):
            raise
    return dir_name
def http_get(url):
    """Fetch *url* with browser-like headers and return the response body.

    Args:
        url: Address to request.

    Returns:
        The response body exactly as ``read()`` returns it.

    Raises:
        Exception: Whatever ``urllib2.urlopen`` or ``read()`` raises is
            propagated unchanged. Previously the error was printed and the
            function then failed with an unrelated ``UnboundLocalError``,
            hiding the real cause from the caller.
    """
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 5.1; rv:19.0) Gecko/20100101 Firefox/19.0",
        "Referer": 'http://www.baidu.com/',
    }
    req = urllib2.Request(url, headers=head)
    res = urllib2.urlopen(req)
    try:
        return res.read()
    finally:
        # Explicit close instead of ``with``: the Python 2 urllib2 response
        # object is not a context manager.
        res.close()
def count_pages(html):
    """Return the page count read from a post's HTML.

    The value is the content of the first ``<span class="red">`` element
    found in *html*, converted to ``int``.

    Args:
        html: Page markup as text.

    Returns:
        The integer held by the first matching span.

    Raises:
        IndexError: If *html* contains no matching span.
        ValueError: If the span content is not an integer.
    """
    red_span = re.compile(r'<span class="red">(.*?)</span>')
    span_values = red_span.findall(html)
    first_value = span_values[0]
    return int(first_value)
def find_pic_urls(html):
    """Build full-size picture URLs for every post image found in *html*.

    For each ``class="BDE_Image"`` occurrence the text between the first
    ``/`` after ``sign=`` and the following double quote is captured and
    appended to the fixed ``pic/item`` prefix.

    Args:
        html: Page markup as text.

    Returns:
        A list of URL strings in document order (empty when nothing
        matches).
    """
    image_ptn = re.compile(r'class="BDE_Image".*?sign=.*?/(.*?)"')
    base_url = 'http://imgsrc.baidu.com/forum/pic/item/'
    full_urls = []
    for image_name in image_ptn.findall(html):
        full_urls.append(base_url + image_name)
    return full_urls
def get_post_id():
    """Obtain the numeric post id from the command line or a prompt.

    The post URL is taken from ``sys.argv[1]`` when an argument is given;
    otherwise the user is prompted for it. Both ``http://`` and
    ``https://`` URLs of the form ``tieba.baidu.com/p/<digits>`` are
    accepted, and surrounding whitespace is ignored.

    Returns:
        The post id as a string of digits.

    Raises:
        ValueError: If the input is not a recognisable post URL.
    """
    if len(sys.argv) > 1:
        user_input = sys.argv[1]
    else:
        user_input = input('Enter post url --> ')

    # Dots are escaped so they only match literal dots in the host name.
    match = re.match(r'https?://tieba\.baidu\.com/p/(\d+)', user_input.strip())
    if match is None:
        raise ValueError(
            'Not a Tieba post URL (expected http://tieba.baidu.com/p/<id>): '
            '{!r}'.format(user_input)
        )
    return match.group(1)
def _fetch_page_html(post_id, page_num):
    """Download one page of the post and return its markup as text."""
    raw = http_get(PAGE_LINK.format(post_id=post_id, page_num=page_num))
    # On Python 3 the body is bytes; decode it instead of scraping the
    # repr() that str(bytes) would produce. On Python 2 ``bytes is str``,
    # so the body is returned untouched.
    if isinstance(raw, bytes) and not isinstance(raw, str):
        return raw.decode('utf-8', 'replace')
    return raw


def main():
    """Download every picture found in a post into a local folder.

    Steps: resolve the post id, read the page count from the first page,
    collect picture URLs from every page, then save each picture into the
    folder returned by ``create_folder`` under a zero-padded sequential
    name that keeps the URL's extension.

    A picture that fails to download is reported on stderr and skipped so
    the remaining pictures are still fetched. A failure while fetching a
    listing page is not caught and aborts the run.
    """
    post_id = get_post_id()

    # NOTE(review): the ``pn`` query parameter is treated as a 1-based page
    # number, so pages 1..pages are requested. The previous code requested
    # pn=0..pages-1, which under 1-based numbering fetches the first page
    # twice and never reaches the last one -- confirm against the site.
    first_page = _fetch_page_html(post_id, 1)
    pages = count_pages(first_page)
    pic_urls = find_pic_urls(first_page)
    for page_num in range(2, pages + 1):
        pic_urls += find_pic_urls(_fetch_page_html(post_id, page_num))

    folder = create_folder(post_id)
    pics = len(pic_urls)
    print('Found {} pictures in {} pages'.format(pics, pages))
    print('Start downloading, pictures will be saved to {}'.format(folder))
    print()

    # Pad the index so file names sort in download order.
    width = len(str(pics))
    for i, url in enumerate(pic_urls):
        filename = '{0:0>{1}d}.{2}'.format(i, width, url.split('.')[-1])
        print('Downloading {} ...'.format(filename))
        try:
            data = http_get(url)
        except Exception as e:
            # Deliberately broad: any single failed download should not
            # abort the rest of the batch.
            print('Failed to download {}: {}'.format(url, e), file=sys.stderr)
            continue
        with open(os.path.join(folder, filename), 'wb') as f:
            f.write(data)
    print('Finished!')


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment