Skip to content

Instantly share code, notes, and snippets.

@luzihang123
Created November 2, 2018 09:23
Show Gist options
  • Save luzihang123/77c7f02304447dd2a273cca526a27e1c to your computer and use it in GitHub Desktop.
Save luzihang123/77c7f02304447dd2a273cca526a27e1c to your computer and use it in GitHub Desktop.
爬虫敏感图片的识别与过滤
from functools import reduce
from PIL import Image
import requests
def phash(img):
    """Return a 64-bit brightness fingerprint of a PIL image.

    The image is shrunk to 8x8, converted to 8-bit grayscale, and each of
    the 64 pixels contributes one bit: 1 when the pixel is at least as
    bright as the mean of all 64 pixels, 0 otherwise. Pixel ``i`` (in
    ``getdata()`` order) maps to bit ``i`` of the result.

    Note: despite the name, this thresholds on mean brightness (an
    "average hash") rather than using a frequency transform.

    :param img: a ``PIL.Image.Image`` instance.
    :return: non-negative ``int`` in the range ``[0, 2**64)``.
    """
    # Image.ANTIALIAS was removed in Pillow 10; Image.LANCZOS is the same
    # filter under its current name and exists in older releases as well.
    small = img.resize((8, 8), Image.LANCZOS).convert('L')
    pixels = list(small.getdata())
    avg = sum(pixels) / 64.0

    fingerprint = 0
    for bit, value in enumerate(pixels):
        if value >= avg:
            fingerprint |= 1 << bit
    return fingerprint
def hamming_distance(a, b):
    """Return the number of bit positions in which ``a`` and ``b`` differ.

    The count is taken over the magnitude of ``a ^ b``, so a negative XOR
    result contributes the set bits of its absolute value.
    """
    remaining = abs(a ^ b)
    differing = 0
    while remaining:
        # Clear the lowest set bit; one iteration per differing bit.
        remaining &= remaining - 1
        differing += 1
    return differing
def is_imgs_similar(img1, img2, max_distance=5):
    """Decide whether two images are similar by comparing their hashes.

    :param img1: first image, passed to ``phash``.
    :param img2: second image, passed to ``phash``.
    :param max_distance: largest Hamming distance between the two hashes
        that still counts as similar. Defaults to 5.
    :return: ``True`` when the hashes differ in at most ``max_distance``
        bits, ``False`` otherwise.
    """
    return hamming_distance(phash(img1), phash(img2)) <= max_distance
# Demo: download a known sensitive image and a crawled image, then report
# whether the crawled one is similar to the sensitive one.
# For convenience both images are pulled from the Sina image host and saved
# locally as sensitive.jpg and target.jpg.
sensitive_url = "https://ws4.sinaimg.cn/large/006tNbRwgy1fwttj7bi36j30sg0sgwm0.jpg"
# Image fetched by the crawler.
target_url = "https://ws3.sinaimg.cn/large/006tNbRwgy1fwttsauo6jj30h80han0y.jpg"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5"}


def _download_image(url, path):
    """Download ``url`` to ``path`` and return the saved file as a PIL image.

    :raises requests.HTTPError: if the server answers with an error status.
    """
    response = requests.get(url, headers=headers, timeout=300)
    # Fail here on a bad response instead of going on to open a missing or
    # stale file left over from an earlier run.
    response.raise_for_status()
    with open(path, 'wb') as f:
        f.write(response.content)
    return Image.open(path)


sensitive_pic = _download_image(sensitive_url, "sensitive.jpg")
target_pic = _download_image(target_url, "target.jpg")

# Check whether the crawled image resembles the sensitive one.
if is_imgs_similar(target_pic, sensitive_pic):
    print("2张图片相似,替换敏感图片为”优雅的python“:{}".format("https://ws2.sinaimg.cn/large/006tNbRwgy1fw9yjmot3uj30y60y6q40.jpg"))
else:
    print("不相似")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment