zhkuo24/getimag.py

## getimag.py
# -*- coding: utf-8 -*-
import re #正则表达式库
import urllib.request
import urllib
import os
import pdb
def getHtml(url):
	"""Fetch ``url`` and return the response body decoded as UTF-8 text.

	Args:
		url: Address (or ``urllib.request.Request``) handed to
			``urllib.request.urlopen``.

	Returns:
		The complete response body as a ``str``.

	Raises:
		urllib.error.URLError: If the resource cannot be opened.
		UnicodeDecodeError: If the body is not valid UTF-8.
	"""
	# The context manager closes the response even when read() raises;
	# previously the connection was left open.
	with urllib.request.urlopen(url) as page:
		html = page.read()
	return html.decode('utf-8')
def getImag(html, path=r'F:\test_spide'):
	"""Download every image URL found in ``html`` into the directory ``path``.

	A URL is any substring of ``html`` that starts with ``http://imgsrc`` and
	runs, without whitespace, up to the first following ``jpg``.  Files are
	saved as ``0.jpg``, ``1.jpg``, ... in the order the URLs appear.

	Args:
		html: Page source to scan for image URLs.
		path: Directory that receives the files; it is created when missing.
			The default is the location this script has always used.

	Returns:
		None.
	"""
	imgre = re.compile(r'http://imgsrc[^\s]*?jpg')
	imglist = imgre.findall(html)
	# exist_ok=True replaces the separate isdir() check followed by makedirs().
	os.makedirs(path, exist_ok=True)
	for x, imgurl in enumerate(imglist):
		# os.path.join uses the platform separator; appending a literal '\\'
		# only produced a path inside the directory on Windows.
		urllib.request.urlretrieve(imgurl, os.path.join(path, '{}.jpg'.format(x)))
# Script body: fetch the Tieba thread page, then download the images it links.
# NOTE(review): there is no __main__ guard, so this also runs on import.
html = getHtml("http://tieba.baidu.com/p/2460150866")
#pdb.set_trace() # when enabled, execution pauses here
getImag(html)

## getimag1.py
import urllib.request
import socket
import re
import sys
import os
import pdb
# Directory that downloaded files are written to.
targetDir = r"F:\test_spide"
def destFile(path):
    """Return the local file path under ``targetDir`` for the URL ``path``.

    The file name is whatever follows the last ``/`` in ``path``; when
    ``path`` contains no ``/`` the whole string is used.  ``targetDir`` is
    created if it does not exist yet.

    Args:
        path: URL (or slash-separated path) of the file to be saved.

    Returns:
        ``targetDir`` joined with the extracted file name.
    """
    # makedirs(exist_ok=True) also creates missing parent directories and
    # replaces the separate isdir() check followed by mkdir().
    os.makedirs(targetDir, exist_ok=True)
    # rpartition never raises: without a '/', its last element is the whole
    # string, whereas rindex raised ValueError.
    fileName = path.rpartition('/')[2]
    return os.path.join(targetDir, fileName)
if __name__ == "__main__":
    # Page whose embedded images are downloaded.
    hostname = "http://tieba.baidu.com/p/2460150866"
    req = urllib.request.Request(hostname)
    # Close the response as soon as the body has been read.
    with urllib.request.urlopen(req) as webpage:
        contentBytes = webpage.read()
    # Decode the body instead of matching against str(bytes): the bytes repr
    # spells newlines as a literal backslash-n, which [^\s] does not stop at.
    # errors='replace' keeps going on bytes that are not valid UTF-8.
    pageText = contentBytes.decode('utf-8', errors='replace')
    # Non-capturing group: findall then yields the URLs themselves.
    imagePattern = re.compile(r'http://imgsrc[^\s]*?(?:jpg|png|gif)')
    # set() drops duplicate URLs; sorted() makes the download order stable.
    for link in sorted(set(imagePattern.findall(pageText))):
        print(link)
        urllib.request.urlretrieve(link, destFile(link))
	# -*- coding: utf-8 -*-
	import re #正则表达式库
	import urllib.request
	import urllib
	import os
	import pdb
	def getHtml(url):
		"""Fetch ``url`` and return the response body decoded as UTF-8 text.

		Args:
			url: Address (or ``urllib.request.Request``) handed to
				``urllib.request.urlopen``.

		Returns:
			The complete response body as a ``str``.

		Raises:
			urllib.error.URLError: If the resource cannot be opened.
			UnicodeDecodeError: If the body is not valid UTF-8.
		"""
		# The context manager closes the response even when read() raises;
		# previously the connection was left open.
		with urllib.request.urlopen(url) as page:
			html = page.read()
		return html.decode('utf-8')
	def getImag(html, path=r'F:\test_spide'):
		"""Download every image URL found in ``html`` into the directory ``path``.

		A URL is any substring of ``html`` that starts with ``http://imgsrc``
		and runs, without whitespace, up to the first following ``jpg``.
		Files are saved as ``0.jpg``, ``1.jpg``, ... in the order the URLs
		appear.

		Args:
			html: Page source to scan for image URLs.
			path: Directory that receives the files; it is created when
				missing.  The default is the location this script has always
				used.

		Returns:
			None.
		"""
		imgre = re.compile(r'http://imgsrc[^\s]*?jpg')
		imglist = imgre.findall(html)
		# exist_ok=True replaces the separate isdir() check followed by makedirs().
		os.makedirs(path, exist_ok=True)
		for x, imgurl in enumerate(imglist):
			# os.path.join uses the platform separator; appending a literal '\\'
			# only produced a path inside the directory on Windows.
			urllib.request.urlretrieve(imgurl, os.path.join(path, '{}.jpg'.format(x)))
	# Script body: fetch the Tieba thread page, then download the images it links.
	html = getHtml("http://tieba.baidu.com/p/2460150866")
	#pdb.set_trace() # when enabled, execution pauses here
	getImag(html)
	import urllib.request
	import socket
	import re
	import sys
	import os
	import pdb
	# Directory that downloaded files are written to.
	targetDir = r"F:\test_spide"
	def destFile(path):
		"""Return the local file path under ``targetDir`` for the URL ``path``.

		The file name is whatever follows the last ``/`` in ``path``; when
		``path`` contains no ``/`` the whole string is used.  ``targetDir`` is
		created if it does not exist yet.

		Args:
			path: URL (or slash-separated path) of the file to be saved.

		Returns:
			``targetDir`` joined with the extracted file name.
		"""
		# makedirs(exist_ok=True) also creates missing parent directories and
		# replaces the separate isdir() check followed by mkdir().
		os.makedirs(targetDir, exist_ok=True)
		# rpartition never raises: without a '/', its last element is the whole
		# string, whereas rindex raised ValueError.
		fileName = path.rpartition('/')[2]
		return os.path.join(targetDir, fileName)
	if __name__ == "__main__":
		# Page whose embedded images are downloaded.
		hostname = "http://tieba.baidu.com/p/2460150866"
		req = urllib.request.Request(hostname)
		# Close the response as soon as the body has been read.
		with urllib.request.urlopen(req) as webpage:
			contentBytes = webpage.read()
		# Decode the body instead of matching against str(bytes): the bytes
		# repr spells newlines as a literal backslash-n, which [^\s] does not
		# stop at.  errors='replace' keeps going on bytes that are not valid
		# UTF-8.
		pageText = contentBytes.decode('utf-8', errors='replace')
		# Unescaped '|' so the group is an alternation of extensions; the
		# escaped form matched only the literal text "jpg|png|gif".
		imagePattern = re.compile(r'http://imgsrc[^\s]*?(?:jpg|png|gif)')
		# set() drops duplicate URLs; sorted() makes the download order stable.
		for link in sorted(set(imagePattern.findall(pageText))):
			print(link)
			urllib.request.urlretrieve(link, destFile(link))