Skip to content

Instantly share code, notes, and snippets.

@blacknon
Last active January 30, 2023 02:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save blacknon/a184df81ece333f93ca7c4c4eabcf54e to your computer and use it in GitHub Desktop.
Save blacknon/a184df81ece333f93ca7c4c4eabcf54e to your computer and use it in GitHub Desktop.
googleイメージ検索を行う検証用スクリプト(python)
#!/usr/bin/env python3
# -*- encoding: UTF-8 -*-
import chromedriver_autoinstaller
import time
import re
import demjson
from bs4 import BeautifulSoup
from selenium import webdriver
def click_img(we):
    """Placeholder that performs no action and always returns ``None``.

    Args:
        we: Unused by the current implementation.
            NOTE(review): presumably a Selenium WebElement -- confirm
            before implementing.
    """
    return None
def _extract_image_records(page_source):
    """Collect image records from the HTML of a Google image-search page.

    Every ``<script>`` element whose text starts with
    ``AF_initDataCallback`` and mentions ``ds:1`` is decoded with demjson.
    Inside the decoded ``data`` value the entries are read from
    ``data[31][0][12][2]``; entries whose first item equals ``1`` are
    turned into result dicts.

    Args:
        page_source: HTML text of the results page.

    Returns:
        A list of dicts with the keys ``link`` (image URL), ``title`` and
        ``pagelink`` (URL of the page the image comes from).
    """
    import sys

    records = []
    soup = BeautifulSoup(page_source, 'lxml')
    for script in soup.find_all('script'):
        txt = script.string
        if txt is None or not txt.startswith('AF_initDataCallback'):
            continue
        if 'ds:1' not in txt:
            continue

        # Strip the JavaScript call wrapper and keep only the object literal.
        # A block that does not have the expected shape is skipped instead of
        # being fed to the decoder as raw JavaScript.
        wrapped = re.search(
            r"^AF_initDataCallback\(({.*key: 'ds:\d',.+, data:.+})\);$",
            txt,
            flags=re.DOTALL,
        )
        if wrapped is None:
            continue

        meta = demjson.decode(wrapped.group(1))['data']
        try:
            entries = meta[31][0][12][2]
        except (IndexError, KeyError, TypeError):
            # The block mentions 'ds:1' but does not carry the image list at
            # the expected position (different block or changed page layout).
            print("skipping AF_initDataCallback block with unexpected layout",
                  file=sys.stderr)
            continue

        for img in entries:
            try:
                if img[0] != 1:
                    continue
                details = img[1][9]['2003']
                records.append({
                    'link': img[1][3][0],
                    'title': details[3],
                    'pagelink': details[2],
                })
            except (IndexError, KeyError, TypeError):
                # Entry is missing one of the expected fields; leave it out
                # rather than aborting the whole extraction.
                continue
    return records


def main():
    """Run a Google image search in Chrome and print the images found.

    Opens the search page for a fixed query, scrolls to the bottom ten
    times (clicking the "show more results" button whenever it can be
    clicked), extracts the image records from the page source, prints the
    resulting list and closes the browser.
    """
    # Imported locally so this function carries everything it needs beyond
    # the names already imported at the top of the file.
    from urllib.parse import quote_plus

    from selenium.common.exceptions import WebDriverException
    from selenium.webdriver.common.by import By

    # Seconds to wait after each interaction (scroll / click).
    sleep_between_interactions = 2
    # Class name of the "show more results" button.
    more_button_class_name = 'mye4qd'
    # Search term.
    query = "水瀬いのり"
    # URL template for the image search.
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # Make sure a matching chromedriver is installed.
    chromedriver_autoinstaller.install()

    driver = webdriver.Chrome()
    try:
        # Percent-encode the query so non-ASCII or reserved characters
        # cannot break the URL.
        driver.get(search_url.format(q=quote_plus(query)))

        # Scroll down repeatedly and click "show more results" when possible.
        for _ in range(10):
            driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
            time.sleep(sleep_between_interactions)
            try:
                # find_element(By..., ...) replaces the removed
                # find_element_by_class_name helper.
                driver.find_element(By.CLASS_NAME, more_button_class_name).click()
            except WebDriverException:
                # Button absent or not clickable yet: best effort, keep
                # scrolling. Only Selenium errors are tolerated here so that
                # programming errors are no longer hidden.
                continue
            time.sleep(sleep_between_interactions)

        result = _extract_image_records(driver.page_source)
        print(result)
    finally:
        # Always close the browser, even if loading or parsing failed.
        driver.quit()
# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment