Last active
January 20, 2017 08:24
-
-
Save tanaikech/d7492d96d11ff6786177dfb3412fbc73 to your computer and use it in GitHub Desktop.
コマンドラインからローカルPCまたはweb上の画像データを使ってGoogleの類似画像検索を行う ref: http://qiita.com/tanaike/items/dd89e41bf77bc3d96f51
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ python スクリプト.py -f ファイル名 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ python スクリプト.py -u URL |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ python スクリプト.py -s テキストデータ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: UTF-8 -*- | |
from bs4 import BeautifulSoup | |
import json | |
import os | |
import re | |
import requests | |
import sys | |
class griSearch(object):
    """Search Google for images similar to a local image file or a web image URL.

    Results are scraped from Google's "visually similar images" page and
    returned as a set of image URLs. `getImgs` can then download a list of
    such URLs from a text file.
    """

    # Base endpoint for Google's search-by-image service.
    baseurl = 'https://www.google.com'
    # Google rejects requests without a browser-like User-Agent.
    # NOTE: the placeholder below must be replaced with a real UA string.
    headers = {'User-Agent': '### ここにブラウザのユーザエージェントを入力 ###'}
    # Image formats accepted for searching.
    extlist = ['jpg', 'gif', 'png', 'bmp', 'svg', 'webp', 'ico']

    def searchByUrl(self, indat):
        """Search by an image URL on the web.

        indat: image URL (must start with http:// or https://).
        Returns a set of similar-image URLs; exits on a bad URL/extension.
        """
        ext = os.path.splitext(indat)[1].replace('.', '')
        # Fix: validate the scheme explicitly; `find('http') == -1` accepted
        # any string that merely contained "http" anywhere.
        if not indat.startswith(('http://', 'https://')):
            print('Error: Bad URL')
            sys.exit(1)
        self.exter(ext)
        indat = self.baseurl + "/searchbyimage?&image_url=" + indat
        r = self.getUrlFromUrl(indat)
        return set(self.getUrl(r))

    def searchByFile(self, indat):
        """Search by a local image file.

        indat: path to the image file.
        Returns a set of similar-image URLs; exits if the file cannot be
        opened or has an unsupported extension.
        """
        # Fix: catch only OSError (a bare `except` also swallowed
        # KeyboardInterrupt etc.) and close the handle via `with` —
        # the original leaked the open file object.
        try:
            with open(indat, 'rb') as f:
                datf = f.read()
        except OSError:
            print('Error: Cannot open ' + indat)
            sys.exit(1)
        ext = os.path.splitext(indat)[1].replace('.', '')
        self.exter(ext)
        upurl = self.baseurl + "/searchbyimage/upload"
        files = {'encoded_image': (os.path.split(indat)[1], datf, 'image/' + ext, {'Expires': '0'})}
        r = self.getUrlFromFile(upurl, files)
        return set(self.getUrl(r))

    def exter(self, ext):
        """Exit with an error message unless `ext` is a supported image format."""
        if ext not in self.extlist:
            print('Error: You can search using images of "%s".' % ', '.join(self.extlist))
            sys.exit(1)

    def getUrlFromUrl(self, upimgurl):
        """GET the search-by-image URL; returns the `requests` response."""
        return requests.get(upimgurl, headers=self.headers)

    def getUrlFromFile(self, upurl, files):
        """POST the image file to the upload endpoint; returns the response."""
        return requests.post(upurl, files=files, headers=self.headers)

    def getUrl(self, r):
        """Extract similar-image URLs from a search-result response.

        Follows the "visually similar images" card link, then reads the
        JSON blob in each `div.rg_meta`, whose "ou" key holds the
        original image URL. Exits if the card is missing (server error).
        """
        soup = BeautifulSoup(r.content, 'html.parser')
        simgurl = self.baseurl + soup.findAll(class_='iu-card-header')[0].get('href')
        if simgurl == self.baseurl + 'None':
            print('Error: server error.')
            sys.exit(1)
        r = requests.get(simgurl, headers=self.headers)
        soup = BeautifulSoup(r.content, 'html.parser')
        imgs = soup.findAll("div", {"class": "rg_meta"})
        return [json.loads(img.get_text())["ou"] for img in imgs]

    def getImgs(self, imgs):
        """Download every URL listed (one per line) in the text file `imgs`.

        Saves each image under a flattened filesystem-safe name derived
        from its URL and prints a success/error count.
        """
        errorfiles = 0
        outfiles = 0
        with open(imgs, 'rt') as f:
            for line in f:
                # Fix: strip the trailing newline BEFORE issuing the request;
                # the original passed "...\n" straight to requests.get and
                # only stripped when building the filename.
                url = line.strip()
                if not url:
                    continue
                r = requests.get(url, headers=self.headers, stream=True)
                if r.status_code != 200:
                    errorfiles += 1
                    continue
                # Build the output name: drop the scheme, replace special
                # characters with '_', turn '.' in path components into '-'
                # (keeping the last component's extension), join with '_'.
                # Fix: the original class `[\?|&|=|@|~]` treated `|` as
                # alternation; the literal pipe is kept for compatibility.
                noscheme = re.sub(r'https?://', '', url)
                safe = re.sub(r'[?&=@~|]', '_', noscheme)
                parts = safe.split("/")
                stem = [v.replace('.', '-') for v in parts[:-1]]
                fname = '_'.join(stem) + '_' + parts[-1]
                with open(fname, 'wb') as g:
                    g.write(r.content)
                outfiles += 1
        print('Number of output images : %d, error URLs : %d \n' % (outfiles, errorfiles))
if __name__ == "__main__": | |
argvs = sys.argv | |
if len(argvs) >= 3: | |
s = griSearch() | |
if argvs[1] == '-u': | |
result = s.searchByUrl(argvs[2]) | |
for i in result: | |
sys.stdout.write(i + '\n') | |
elif argvs[1] == '-f': | |
result = s.searchByFile(argvs[2]) | |
for i in result: | |
sys.stdout.write(i + '\n') | |
elif argvs[1] == '-s': | |
s.getImgs(argvs[2]) | |
else: | |
msg = '''Usage: | |
python {script} [option] [file | URL] | |
Version: 1.00 | |
griSearch is a tool for retrieving similar images from an image on | |
local PC or web using Google. Image formats which can be searched | |
are jpg, gif, png, bmp, svg, webp and ico. The result is output as | |
a list of image URLs. | |
Options : | |
-f Search similar images using image FILE in local PC. | |
-u Search similar images using image URL on web. | |
-s Output image dat from text data with image URLs | |
Samples : | |
python {script} -f sample.png | |
python {script} -u http://www.sample.com/sample.png | |
python {script} -s sample.txt | |
''' | |
print(msg.format(script=argvs[0])) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment