Skip to content

Instantly share code, notes, and snippets.

@wenLiangcan
Last active December 24, 2015 18:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save wenLiangcan/6843820 to your computer and use it in GitHub Desktop.
Save wenLiangcan/6843820 to your computer and use it in GitHub Desktop.
#!/usr/bin/python
# Filename: get_tieba_pic.py
from __future__ import print_function
import os
import re
import sys
# Python 2/3 compatibility shim: expose the URL-opening module under the
# single name ``urllib2`` and, on Python 2, make ``input`` return the raw
# string typed by the user.
if sys.version_info[0] != 2:
    import urllib.request as urllib2
else:
    import urllib2
    input = raw_input

# URL template for one page of a post; filled in with ``post_id`` and
# ``page_num`` via ``str.format``.
PAGE_LINK = 'http://tieba.baidu.com/p/{post_id}?see_lz=1&pn={page_num}'
def create_folder(post_id):
    """Create the download directory for a post (if needed) and return its name.

    The directory is named ``tieba_<post_id>`` and is created relative to the
    current working directory.

    Args:
        post_id: Identifier of the post; interpolated into the directory name.

    Returns:
        The directory name, e.g. ``'tieba_12345'``.

    Raises:
        OSError: If the directory cannot be created, or if the path already
            exists but is not a directory.
    """
    dir_name = 'tieba_{}'.format(post_id)
    try:
        # Attempt the creation directly instead of checking first: this avoids
        # the window between an existence check and the mkdir call.
        os.mkdir(dir_name)
    except OSError:
        # An already existing directory is fine; anything else (a regular
        # file with that name, missing permissions, ...) is a real error.
        if not os.path.isdir(dir_name):
            raise
    return dir_name
def http_get(url):
    """Fetch *url* with browser-like request headers and return the body.

    Args:
        url: Address to request.

    Returns:
        The raw response body as returned by the response's ``read()``.

    Raises:
        Exception: Any error raised while opening or reading the URL. The
            error is printed first and then re-raised unchanged, so callers
            see the real cause of the failure.
    """
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 5.1; rv:19.0) Gecko/20100101 Firefox/19.0",
        "Referer": 'http://www.baidu.com/',
    }
    req = urllib2.Request(url, headers=head)
    try:
        response = urllib2.urlopen(req)
        try:
            return response.read()
        finally:
            # Always release the connection, even when read() fails.
            response.close()
    except Exception as e:
        print(e)
        # Re-raise: there is no body to return, and falling through would
        # only replace the real error with an UnboundLocalError.
        raise
def count_pages(html):
    """Extract the page count of a post from its HTML.

    The count is read from the first ``<span class="red">...</span>`` element
    whose content is a number (surrounding whitespace is ignored).

    Args:
        html: HTML text of a post page.

    Returns:
        The page count as an ``int``.

    Raises:
        ValueError: If *html* contains no such numeric element.
    """
    match = re.search(r'<span class="red">\s*(\d+)\s*</span>', html)
    if match is None:
        # Fail with a message that names the problem instead of an opaque
        # IndexError from indexing an empty result list.
        raise ValueError('page count not found in html')
    return int(match.group(1))
def find_pic_urls(html):
    """Build picture URLs for every ``class="BDE_Image"`` occurrence in *html*.

    For each occurrence, the text between the first ``/`` following ``sign=``
    and the next double quote is captured and appended to a fixed
    ``imgsrc.baidu.com`` prefix.

    Args:
        html: HTML text to scan.

    Returns:
        A list of URL strings in order of appearance; empty when nothing
        matches.
    """
    base_url = 'http://imgsrc.baidu.com/forum/pic/item/'
    image_pattern = re.compile(r'class="BDE_Image".*?sign=.*?/(.*?)"')
    urls = []
    for found in image_pattern.finditer(html):
        urls.append(base_url + found.group(1))
    return urls
def get_post_id():
    """Return the numeric post id taken from a Tieba post URL.

    The URL is read from the first command-line argument when one is given;
    otherwise the user is prompted for it. Both ``http://`` and ``https://``
    URLs are accepted, the scheme may be omitted, and surrounding whitespace
    is ignored.

    Returns:
        The post id as a string of digits.

    Raises:
        ValueError: If the input is not a ``tieba.baidu.com/p/<digits>`` URL.
    """
    if len(sys.argv) == 1:
        user_input = input('Enter post url --> ')
    else:
        user_input = sys.argv[1]
    # Dots are escaped so that they only match a literal '.' in the host name.
    match = re.match(r'(?:https?://)?tieba\.baidu\.com/p/(\d+)',
                     user_input.strip())
    if match is None:
        raise ValueError('Not a Tieba post url: {!r}'.format(user_input))
    return match.group(1)
def _fetch_page(post_id, page_num):
    """Download one page of a post and return its HTML as text."""
    raw = http_get(PAGE_LINK.format(post_id=post_id, page_num=page_num))
    # Decode rather than call str(): on Python 3, str() of a bytes object
    # yields its "b'...'" repr instead of the page text. Undecodable bytes are
    # replaced, which is harmless because the page is only searched with
    # ASCII patterns.
    return raw.decode('utf-8', 'replace')


def main():
    """Download every picture of a post into a ``tieba_<post_id>`` folder.

    Steps: obtain the post id, fetch the first page to learn the page count,
    collect picture URLs from all pages, then save each picture under a
    zero-padded sequential file name that keeps the URL's extension.
    """
    post_id = get_post_id()

    # Tieba's ``pn`` parameter is 1-based, so pages run from 1 to ``pages``
    # inclusive. Starting at 0 and looping over range(1, pages) would fetch
    # the first page twice and never reach the last one.
    first_page = _fetch_page(post_id, 1)
    pages = count_pages(first_page)
    pic_urls = find_pic_urls(first_page)
    for page_num in range(2, pages + 1):
        pic_urls += find_pic_urls(_fetch_page(post_id, page_num))

    folder = create_folder(post_id)
    pics = len(pic_urls)
    print('Found {} pictures in {} pages'.format(pics, pages))
    print('Start downloading, pictures will be saved to {}'.format(folder))
    print()

    # Pad the index to the number of digits in the total, so that the files
    # sort in download order.
    name_template = '{:0>%dd}.{}' % len(str(pics))
    for index, url in enumerate(pic_urls):
        filename = name_template.format(index, url.split('.')[-1])
        print('Downloading {} ...'.format(filename))
        data = http_get(url)
        with open(os.path.join(folder, filename), 'wb') as f:
            f.write(data)
    print('Finished!')


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment