Skip to content

Instantly share code, notes, and snippets.

@tanaikech
Last active January 20, 2017 08:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tanaikech/d7492d96d11ff6786177dfb3412fbc73 to your computer and use it in GitHub Desktop.
Save tanaikech/d7492d96d11ff6786177dfb3412fbc73 to your computer and use it in GitHub Desktop.
コマンドラインからローカルPCまたはweb上の画像データを使ってGoogleの類似画像検索を行う ref: http://qiita.com/tanaike/items/dd89e41bf77bc3d96f51
$ python スクリプト.py -f ファイル名
$ python スクリプト.py -u URL
$ python スクリプト.py -s テキストデータ
# -*- coding: UTF-8 -*-
from bs4 import BeautifulSoup
import json
import os
import re
import requests
import sys
class griSearch(object):
    """Query Google's reverse image search ("search by image") and collect result URLs.

    Three entry points:
      * searchByUrl(url)   -- search using an image that is already on the web.
      * searchByFile(path) -- upload a local image file and search with it.
      * getImgs(path)      -- download every image URL listed (one per line) in a text file.

    NOTE(review): this scrapes Google's HTML (``iu-card-header`` / ``rg_meta``
    markup), which is not a stable API — selectors may break when Google
    changes its page layout.
    """

    baseurl = 'https://www.google.com'
    # Google rejects requests that do not look like a browser; the user must
    # fill in a real browser User-Agent string here.
    headers = {'User-Agent': '### ここにブラウザのユーザエージェントを入力 ###'}
    # Extensions Google's search-by-image accepts.
    extlist = ['jpg', 'gif', 'png', 'bmp', 'svg', 'webp', 'ico']

    def searchByUrl(self, indat):
        """Search by the image at URL *indat*; return a set of similar-image URLs.

        Exits the process with status 1 on a malformed URL or unsupported
        extension.
        """
        ext = os.path.splitext(indat)[1].replace('.', '')
        # Minimal sanity check that the argument is an http(s) URL.
        if indat.find('http') == -1:
            print('Error: Bad URL')
            sys.exit(1)
        self.exter(ext)
        indat = self.baseurl + "/searchbyimage?&image_url=" + indat
        r = self.getUrlFromUrl(indat)
        # set() deduplicates URLs that appear more than once in the results.
        return set(self.getUrl(r))

    def searchByFile(self, indat):
        """Upload local image file *indat* and return a set of similar-image URLs.

        Exits the process with status 1 if the file cannot be opened or has an
        unsupported extension.
        """
        # OSError covers missing files, permission errors, etc.; the previous
        # bare ``except:`` also hid KeyboardInterrupt and real bugs.
        try:
            datf = open(indat, 'rb')
        except OSError:
            print('Error: Cannot open ' + indat)
            sys.exit(1)
        # ``with`` guarantees the handle is closed once the upload finishes
        # (the original leaked the open file object).
        with datf:
            ext = os.path.splitext(indat)[1].replace('.', '')
            self.exter(ext)
            upurl = self.baseurl + "/searchbyimage/upload"
            files = {'encoded_image': (os.path.split(indat)[1], datf, 'image/' + ext, {'Expires': '0'})}
            r = self.getUrlFromFile(upurl, files)
        return set(self.getUrl(r))

    def exter(self, ext):
        """Exit with status 1 unless *ext* is a supported image extension."""
        if ext not in self.extlist:
            print('Error: You can search using images of "%s".' % ', '.join(self.extlist))
            sys.exit(1)

    def getUrlFromUrl(self, upimgurl):
        """GET the search-by-image page for an image URL; return the Response."""
        return requests.get(upimgurl, headers=self.headers)

    def getUrlFromFile(self, upurl, files):
        """POST a local image to the upload endpoint; return the Response."""
        return requests.post(upurl, files=files, headers=self.headers)

    def getUrl(self, r):
        """Scrape the 'visually similar images' page out of Response *r*.

        Returns a list of direct image URLs (the ``"ou"`` field of each
        ``rg_meta`` JSON blob on the results page).
        """
        soup = BeautifulSoup(r.content, 'html.parser')
        # The 'Visually similar images' card links to the full results page;
        # if the selector finds nothing, get('href') yields None.
        simgurl = self.baseurl + soup.findAll(class_='iu-card-header')[0].get('href')
        if simgurl == self.baseurl + 'None':
            print('Error: server error.')
            sys.exit(1)
        r = requests.get(simgurl, headers=self.headers)
        soup = BeautifulSoup(r.content, 'html.parser')
        imgs = soup.findAll("div", {"class": "rg_meta"})
        urlar = [json.loads(img.get_text())["ou"] for img in imgs]
        return urlar

    def getImgs(self, imgs):
        """Download every image URL listed (one per line) in text file *imgs*.

        Each image is saved in the current directory under a filename derived
        from its URL; a summary count is printed at the end.
        """
        errorfiles = 0
        outfiles = 0
        with open(imgs, 'rt') as f:
            for line in f:
                # Strip once up front: the raw line carries a trailing
                # newline, which modern requests rejects inside a URL
                # (the original passed the unstripped line to requests.get).
                url = line.strip()
                if not url:
                    continue
                r = requests.get(url, headers=self.headers, stream=True)
                if r.status_code != 200:
                    errorfiles += 1
                    continue
                # Build a filesystem-safe filename from the URL:
                # drop the scheme, replace query/special characters with '_',
                # dot-mangle every path component except the basename, then
                # join everything with '_'.
                turl0 = re.sub(r'http(s)?://', '', url)
                turl1 = re.sub(r'[\?|&|=|@|~]', '_', turl0)
                turl2 = turl1.split("/")
                turl3 = [v.replace('.', '-') for i, v in enumerate(turl2) if i != len(turl2) - 1]
                turl4 = '_'.join(turl3) + '_' + turl2[len(turl2) - 1]
                with open(turl4.strip(), 'wb') as g:
                    g.write(r.content)
                outfiles += 1
        print('Number of output images : %d, error URLs : %d \n' % (outfiles, errorfiles))
if __name__ == "__main__":
    argvs = sys.argv
    # Require both a recognized option and its argument; anything else —
    # including an unknown option like '-x' (which the original silently
    # ignored) — falls through to the usage message.
    if len(argvs) >= 3 and argvs[1] in ('-u', '-f', '-s'):
        s = griSearch()
        if argvs[1] == '-u':
            result = s.searchByUrl(argvs[2])
        elif argvs[1] == '-f':
            result = s.searchByFile(argvs[2])
        else:  # '-s': download images listed in a text file; prints its own summary.
            s.getImgs(argvs[2])
            result = None
        if result is not None:
            # One similar-image URL per line.
            for i in result:
                sys.stdout.write(i + '\n')
    else:
        msg = '''Usage:
python {script} [option] [file | URL]
Version: 1.00
griSearch is a tool for retrieving similar images from an image on
local PC or web using Google. Image formats which can be searched
are jpg, gif, png, bmp, svg, webp and ico. The result is output as
a list of image URLs.
Options :
-f Search similar images using image FILE in local PC.
-u Search similar images using image URL on web.
-s Output image dat from text data with image URLs
Samples :
python {script} -f sample.png
python {script} -u http://www.sample.com/sample.png
python {script} -s sample.txt
'''
        print(msg.format(script=argvs[0]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment