Last active
June 14, 2016 15:48
-
-
Save zhkuo24/6545c6c7f4c3430f4d9b to your computer and use it in GitHub Desktop.
从百度贴吧页面下载图片
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import re #正则表达式库 | |
import urllib.request | |
import urllib | |
import os | |
import pdb | |
def getHtml(url) -> str:
    """Fetch *url* and return the response body decoded as UTF-8 text.

    Args:
        url: The address to open; passed unchanged to
            ``urllib.request.urlopen``.

    Returns:
        The full response body as a ``str``.

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The context manager closes the response even when read() raises;
    # previously the response object was never closed.
    with urllib.request.urlopen(url) as page:
        raw = page.read()
    return raw.decode('utf-8')
def getImag(html, path=r'F:\test_spide'):
    """Download every ``http://imgsrc...jpg`` URL found in *html*.

    The images are saved as ``0.jpg``, ``1.jpg``, ... inside *path*, numbered
    in the order the URLs appear in *html*. Duplicate URLs are downloaded
    again under a new number.

    Args:
        html: Page source to scan for image URLs.
        path: Directory that receives the downloaded files. It is created
            (including missing parents) when it does not exist yet.

    Returns:
        None.

    Raises:
        urllib.error.URLError: If one of the image URLs cannot be retrieved.
    """
    # Lazy match: the shortest run of non-whitespace characters that starts
    # at "http://imgsrc" and ends in "jpg".
    imgre = re.compile(r'http://imgsrc[^\s]*?jpg')

    os.makedirs(path, exist_ok=True)
    for index, imgurl in enumerate(imgre.findall(html)):
        # os.path.join uses the platform's separator; a hard-coded '\\'
        # only produces a path inside the directory on Windows.
        target = os.path.join(path, '{}.jpg'.format(index))
        urllib.request.urlretrieve(imgurl, target)
# Fetch the source of the Tieba thread page ...
html = getHtml("http://tieba.baidu.com/p/2460150866")
# pdb.set_trace()  # debugging aid: uncomment to pause here before downloading
# ... then save every image it links to.
getImag(html)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import urllib.request | |
import socket | |
import re | |
import sys | |
import os | |
import pdb | |
# Default directory that downloaded images are written to.
targetDir = r"F:\test_spide"


def destFile(path, target_dir=None):
    """Return the local file path under which the URL *path* should be saved.

    The file name is the part of *path* after its last ``/``; a *path*
    without any ``/`` is used as the file name unchanged.

    Args:
        path: URL (or slash-separated path) of the remote file.
        target_dir: Directory to place the file in. Defaults to the
            module-level ``targetDir``. It is created, including missing
            parents, when it does not exist yet.

    Returns:
        ``target_dir`` joined with the file name taken from *path*.
    """
    if target_dir is None:
        target_dir = targetDir
    # makedirs(exist_ok=True) also creates missing parent directories and
    # does not fail when the directory already exists.
    os.makedirs(target_dir, exist_ok=True)
    # rsplit never raises, unlike rindex, when *path* contains no '/'.
    filename = path.rsplit('/', 1)[-1]
    return os.path.join(target_dir, filename)
def _find_image_links(text):
    """Return the sorted, de-duplicated imgsrc image URLs found in *text*.

    A URL starts at ``http://imgsrc`` and runs over non-whitespace characters
    up to the first ``jpg``, ``png`` or ``gif``.
    """
    return sorted(set(re.findall(r'http://imgsrc[^\s]*?(?:jpg|png|gif)', text)))


if __name__ == "__main__":
    hostname = "http://tieba.baidu.com/p/2460150866"
    req = urllib.request.Request(hostname)
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(req) as webpage:
        contentBytes = webpage.read()
    # Decode instead of calling str() on the bytes: str() yields the
    # "b'...'" repr in which newlines and tabs are escaped, so the regex's
    # whitespace boundary stops working and matches can run across lines.
    # The URLs being searched for are ASCII, so undecodable bytes elsewhere
    # in the page are simply replaced.
    pageText = contentBytes.decode('utf-8', errors='replace')
    for link in _find_image_links(pageText):
        print(link)
        urllib.request.urlretrieve(link, destFile(link))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment