-
-
Save dalf/ec228b4aec97033de96ec92a504cf988 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""This is the implementation of the google images engine.

.. admonition:: Content-Security-Policy (CSP)

   This engine needs to allow images from the `data URLs`_ (prefixed with the
   ``data:`` scheme)::

     Header set Content-Security-Policy "img-src 'self' data: ;"

.. _data URLs:
   https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs
"""
from urllib.parse import urlencode | |
import re | |
import json | |
from lxml import html | |
from searx.utils import eval_xpath_getindex | |
from searx.engines.google import ( | |
get_lang_info, | |
time_range_dict, | |
detect_google_sorry, | |
) | |
# pylint: disable=unused-import | |
from searx.engines.google import supported_languages_url, _fetch_supported_languages | |
# pylint: enable=unused-import | |
# about
# Static metadata describing the upstream service this engine scrapes.
about = {
    "website": 'https://images.google.com',
    "wikidata_id": 'Q521550',
    "official_api_documentation": 'https://developers.google.com/custom-search',
    # the engine parses the regular HTML result page, not the official API
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}
# engine dependent config
# NOTE(review): apart from ``filter_mapping`` these names are not read anywhere
# in this module; presumably the engine framework consumes them -- confirm.
categories = ['images', 'web']
paging = False
use_locale_domain = True
time_range_support = True
safesearch = True

# Maps the numeric safesearch level to the value of the ``safe`` URL parameter.
# request() only adds ``safe`` when the level is truthy, so the entry for 0 is
# never sent.
filter_mapping = {0: 'images', 1: 'active', 2: 'active'}
# Matches a bare JavaScript object key: a run of word characters that follows
# ``{``, whitespace or ``,`` and is immediately followed by ``:``.
quote_keys_regex = re.compile(r'([\{\s,])(\w+)(:)')


def js_variable_to_python(js_variable):
    """Convert the source text of a JavaScript literal into a Python value.

    The text is rewritten into JSON and then parsed with :py:func:`json.loads`:

    - single-quoted strings become double-quoted strings,
    - unquoted object keys get quoted (``{ a: 12 }`` becomes ``{ "a": 12 }``),
    - the ``\\'`` escape, which JSON does not know, is replaced by a plain ``'``.

    This is not a JavaScript parser; only the differences listed above are
    handled.

    :param js_variable: JavaScript source of an object / array / scalar literal.
    :returns: the decoded Python value (dict, list, str, number, ...).
    :raises json.JSONDecodeError: if the rewritten text still is not valid JSON.
    """
    # Cut the text on every quote character, keeping the quotes:
    #   r"""{ a:"f\"irst", c:'sec"ond'}"""
    # becomes
    #   ['{ a:', '"', 'f\\', '"', 'irst', '"', ', c:', "'", 'sec', '"', 'ond', "'", '}']
    # Text parts (even indexes) and quote parts (odd indexes) strictly alternate.
    parts = re.split(r'(["\'])', js_variable)

    # When not None, holds the quote character that opened the current JS string.
    in_string = None

    for i, part in enumerate(parts):
        if part not in ('"', "'"):
            if in_string:
                # Text inside a JS string: hide the colons behind a temporary
                # character so quote_keys_regex never matches inside a string.
                parts[i] = part.replace(':', chr(1))
            continue

        # From here on, part is a quote character.
        if in_string is None:
            # Start of a new string; JSON only knows double-quoted strings.
            parts[i] = '"'
            in_string = part
            continue

        # Inside a string: the quote is escaped only when it is preceded by an
        # ODD number of backslashes (an even run is a sequence of escaped
        # backslashes, e.g. the JS string "a\\" ends right after the run).
        before = parts[i - 1]
        escaped = (len(before) - len(before.rstrip('\\'))) % 2 == 1

        if part == "'":
            if escaped:
                # \' is valid in JS but an invalid escape in JSON:
                # drop the backslash and keep the plain quote.
                parts[i - 1] = before[:-1]
            elif in_string == "'":
                # Unescaped delimiter: close the string with a double quote.
                parts[i] = '"'
                in_string = None
            # else: a plain ' inside a double-quoted string needs no change
        elif not escaped:
            if in_string == '"':
                # Unescaped delimiter: the string ends here.
                in_string = None
            else:
                # Bare " inside a single-quoted JS string: it must be escaped
                # now that the string is delimited by double quotes.
                parts[i] = r'\"'
        # else: \" is already a valid JSON escape, keep it untouched

    s = ''.join(parts)
    # add quotes around the keys:  { a: 12 }  becomes  { "a": 12 }
    s = quote_keys_regex.sub(r'\1"\2"\3', s)
    # restore the colons hidden inside the strings
    s = s.replace(chr(1), ':')
    return json.loads(s)
def request(query, params):
    """Google-Images search request

    Builds the search URL (``tbm=isch`` selects the image search) and the HTTP
    headers, and stores them in ``params``.

    :param query: the search terms.
    :param params: request parameters; ``time_range``, ``safesearch`` and
        ``headers`` are read here, ``url`` and ``headers`` are written.
    :returns: the updated ``params``.
    """
    # NOTE(review): supported_languages, language_aliases and logger are not
    # defined in this module; presumably they are injected into the module
    # namespace when the engine is loaded -- confirm.
    lang_info = get_lang_info(params, supported_languages, language_aliases, False)
    logger.debug("HTTP header Accept-Language --> %s", lang_info['headers']['Accept-Language'])

    query_url = (
        'https://'
        + lang_info['subdomain']
        + '/search'
        + "?"
        + urlencode(
            {
                'q': query,
                'tbm': "isch",
                **lang_info['params'],
                'ie': "utf8",
                'oe': "utf8",
                'num': 30,
            }
        )
    )

    # optional filters are only appended when they are actually set
    if params['time_range'] in time_range_dict:
        query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})
    if params['safesearch']:
        query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
    params['url'] = query_url

    params['headers'].update(lang_info['headers'])
    params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
    return params
def force_https(url):
    """Return ``url`` with a leading ``http://`` replaced by ``https://``.

    Any other URL is returned unchanged.
    """
    if url.startswith('http://'):
        # 7 == len('http://')
        return 'https://' + url[7:]
    return url
def response(resp):
    """Get response from google's search request

    Extracts the image data embedded in an ``AF_initDataCallback(...)`` script
    of the result page and converts each usable entry into a result dict for
    the ``images.html`` template.

    :param resp: HTTP response; only ``resp.text`` is read here (``resp`` is
        also handed to ``detect_google_sorry``).
    :returns: list of result dicts.
    """
    results = []

    # NOTE(review): presumably raises when Google answered with its "sorry" /
    # CAPTCHA page instead of results -- confirm in searx.engines.google.
    detect_google_sorry(resp)

    # convert the text to dom
    dom = html.fromstring(resp.text)
    img_src_script = eval_xpath_getindex(dom, '//script[contains(., "AF_initDataCallback({key: ")]', 1).text
    # Strip the function call around the JS object; the last two characters
    # are dropped as well (presumably the closing ");").
    data = js_variable_to_python(img_src_script.strip().replace('AF_initDataCallback(', '')[:-2])

    # there are not suggestions, rather subqueries
    # for suggestion in data['data'][1][0][0][1]:
    #     results.append({'suggestion': suggestion[0]})

    for img in data['data'][31][0][12][2]:
        # img[1] is read below, so at least two items are required
        if not isinstance(img, list) or len(img) < 2:
            continue
        img_data = img[1]
        # img_data[9] is read below, so at least ten items are required
        if not isinstance(img_data, list) or len(img_data) < 10:
            continue
        img_meta = img_data[9]
        # skip entries whose metadata is not a dict holding a "2003" list
        if not isinstance(img_meta, dict) or not isinstance(img_meta.get("2003"), list):
            continue

        img_1_9_2003 = img_meta["2003"]
        img_fullsize = img_data[3]
        img_thumbnail = img_data[2]
        results.append(
            {
                'url': img_1_9_2003[2],
                'title': img_1_9_2003[3],
                'source': img_1_9_2003[12],
                'img_format': str(img_fullsize[2]) + ' x ' + str(img_fullsize[1]),
                'img_src': force_https(img_fullsize[0]),
                'thumbnail_src': force_https(img_thumbnail[0]),
                'template': 'images.html',
            }
        )
    return results
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment