Skip to content

Instantly share code, notes, and snippets.

@zhkuo24
Last active June 14, 2016 15:48
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zhkuo24/6545c6c7f4c3430f4d9b to your computer and use it in GitHub Desktop.
Save zhkuo24/6545c6c7f4c3430f4d9b to your computer and use it in GitHub Desktop.
从百度贴吧页面下载图片
# -*- coding: utf-8 -*-
import re #正则表达式库
import urllib.request
import urllib
import os
import pdb
def getHtml(url):
    """Fetch *url* and return the response body as text.

    The body is decoded with the charset declared in the response's
    Content-Type header; when no charset is declared (or the declared
    name is unknown to Python) UTF-8 is used.

    Args:
        url: A URL string or ``urllib.request.Request`` accepted by
            ``urllib.request.urlopen``.

    Returns:
        The decoded response body as ``str``.

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
        UnicodeDecodeError: If the body is not valid in the chosen charset.
    """
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(url) as page:
        raw = page.read()
        charset = page.headers.get_content_charset() or 'utf-8'
    try:
        return raw.decode(charset)
    except LookupError:
        # The server advertised a codec name Python does not recognise.
        return raw.decode('utf-8')
def getImag(html, path=r'F:\test_spide'):
    """Download every ``http://imgsrc...jpg`` URL found in *html*.

    The images are saved as ``0.jpg``, ``1.jpg``, ... inside *path*, in the
    order their URLs appear in *html*. The directory (including any missing
    parents) is created when it does not exist yet.

    Args:
        html: Page source to scan for image URLs.
        path: Directory the images are written to. Defaults to the
            location the script has always used.

    Returns:
        None.

    Raises:
        urllib.error.URLError: If one of the images cannot be downloaded.
    """
    # Non-greedy match: each URL ends at the first "jpg" after the prefix.
    image_urls = re.findall(r'http://imgsrc[^\s]*?jpg', html)

    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(path, exist_ok=True)

    for index, image_url in enumerate(image_urls):
        # os.path.join keeps the file inside *path* on every platform,
        # unlike appending a literal backslash.
        target = os.path.join(path, '{}.jpg'.format(index))
        urllib.request.urlretrieve(image_url, target)
# Driver for the first script: fetch the forum thread page, then save
# every image it references.
thread_url = "http://tieba.baidu.com/p/2460150866"
html = getHtml(thread_url)
# pdb.set_trace()  # uncomment to pause in the debugger at this point
getImag(html)
import urllib.request
import socket
import re
import sys
import os
import pdb
# Directory that downloaded images are written into.
targetDir = r"F:\test_spide"


def destFile(path):
    """Return the local file path under ``targetDir`` for the URL *path*.

    The file name is the part of *path* after its last ``/``; a *path*
    without any ``/`` is used as the file name unchanged. ``targetDir``
    (including missing parent directories) is created if necessary.

    Args:
        path: URL (or slash-separated path) of the file to be saved.

    Returns:
        ``targetDir`` joined with the file name taken from *path*.
    """
    # makedirs(exist_ok=True) also creates parents and does not race with
    # a concurrent creation the way isdir() + mkdir() does.
    os.makedirs(targetDir, exist_ok=True)
    # rsplit never raises, unlike rindex('/') on a path without a slash.
    filename = path.rsplit('/', 1)[-1]
    return os.path.join(targetDir, filename)
if __name__ == "__main__":
    # Forum thread whose images are downloaded.
    hostname = "http://tieba.baidu.com/p/2460150866"
    req = urllib.request.Request(hostname)

    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(req) as webpage:
        contentBytes = webpage.read()

    # Decode the bytes instead of scanning str(bytes): the repr form
    # ("b'...'" with escaped newlines and quotes) can glue separate lines
    # together and produce bogus matches. Undecodable bytes are replaced
    # because only the ASCII image URLs matter here.
    content = contentBytes.decode('utf-8', errors='replace')

    # A non-capturing group makes findall() return the full URL strings;
    # the set drops URLs that appear on the page more than once.
    links = set(re.findall(r'http://imgsrc[^\s]*?(?:jpg|png|gif)', content))

    # NOTE: the leftover pdb.set_trace() breakpoint was removed so the
    # script runs unattended.
    for link in sorted(links):
        print(link)
        urllib.request.urlretrieve(link, destFile(link))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment