dalf/google_images.py Secret

## google_images.py
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""This is the implementation of the google images engine.

.. admonition:: Content-Security-Policy (CSP)

   This engine needs to allow images from the `data URLs`_ (prefixed with the
   ``data:`` scheme)::

       Header set Content-Security-Policy "img-src 'self' data: ;"

.. _data URLs:
   https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs
"""

from urllib.parse import urlencode
import re
import json
from lxml import html

from searx.utils import eval_xpath_getindex
from searx.engines.google import (
    get_lang_info,
    time_range_dict,
    detect_google_sorry,
)

# pylint: disable=unused-import
from searx.engines.google import supported_languages_url, _fetch_supported_languages

# pylint: enable=unused-import

# metadata describing the upstream service this engine queries
about = dict(
    website='https://images.google.com',
    wikidata_id='Q521550',
    official_api_documentation='https://developers.google.com/custom-search',
    use_official_api=False,
    require_api_key=False,
    results='HTML',
)

# engine dependent config
categories = ['images', 'web']
use_locale_domain = True
safesearch = True
time_range_support = True
paging = False

# value of the ``safe`` URL argument for each ``safesearch`` level, see request()
filter_mapping = {
    0: 'images',
    1: 'active',
    2: 'active',
}
# an unquoted object key: ``{ a:`` / ``, a:`` / `` a:``
quote_keys_regex = re.compile(r'([\{\s,])(\w+)(:)')
# a complete JS string literal, single or double quoted.  A backslash always
# consumes the character that follows it, so ``\\`` and ``\"`` are read correctly.
# The single capture group makes re.split() keep the literals in its result.
_js_string_regex = re.compile(
    r'("[^"\\]*(?:\\.[^"\\]*)*"'  # double-quoted literal
    r"|'[^'\\]*(?:\\.[^'\\]*)*')",  # single-quoted literal
    re.DOTALL,
)
# inside the body of a string literal: an escape sequence or a bare double quote
_js_string_fixup_regex = re.compile(r'\\.|"', re.DOTALL)


def _js_string_to_json(js_string):
    """Convert one JS string literal (delimiters included) to a JSON string literal."""

    def fix(match):
        token = match.group(0)
        if token == '"':
            # bare double quote: only possible inside a single-quoted literal,
            # it has to be escaped once the delimiters become double quotes
            return r'\"'
        if token == r"\'":
            # JSON has no ``\'`` escape: emit the plain single quote
            return "'"
        # every other escape sequence is passed through unchanged
        return token

    return '"' + _js_string_fixup_regex.sub(fix, js_string[1:-1]) + '"'


def js_variable_to_python(js_variable):
    """Convert a javascript variable into JSON and then load the value.

    Two rewrites are applied before the text is handed to :py:func:`json.loads`:

    - string literals are re-delimited with double quotes (single-quoted JS
      strings are not valid JSON) and their quote escapes are adjusted,
    - unquoted object keys outside of strings are quoted
      (``{ a: 12 }`` becomes ``{ "a": 12 }``).

    Example: ``{ a:"f\\"irst", c:'sec"ond' }`` is loaded as
    ``{'a': 'f"irst', 'c': 'sec"ond'}``.

    Any other JavaScript-only syntax is left untouched and is rejected by the
    JSON parser.

    :param js_variable: source text of a JS literal (object, array, ...)
    :returns: the decoded Python value
    :raises json.JSONDecodeError: if the rewritten text is not valid JSON
    """
    # cut the text into alternating [code, string, code, string, ..., code]:
    # string literals are always at the odd indexes
    parts = _js_string_regex.split(js_variable)
    for i, part in enumerate(parts):
        if i % 2:
            parts[i] = _js_string_to_json(part)
        else:
            # keys are only quoted outside of strings, so a colon inside a
            # string value can never be mistaken for a key separator
            parts[i] = quote_keys_regex.sub(r'\1"\2"\3', part)
    # load the JSON and return the result
    return json.loads(''.join(parts))


def request(query, params):
    """Build the Google Images search request.

    Fills ``params['url']`` with the search URL and extends
    ``params['headers']`` with the language headers plus an ``Accept`` header.

    ``supported_languages``, ``language_aliases`` and ``logger`` are not defined
    in the visible part of this module; presumably they are provided as module
    globals by whatever loads the engine -- TODO confirm.

    :param query: search terms, sent as the ``q`` URL argument
    :param params: request parameters; ``time_range``, ``safesearch`` and
        ``headers`` are read, ``url`` is written
    :returns: the same ``params`` object
    """

    lang_info = get_lang_info(params, supported_languages, language_aliases, False)
    logger.debug("HTTP header Accept-Language --> %s", lang_info['headers']['Accept-Language'])

    endpoint = 'https://' + lang_info['subdomain'] + '/search'
    # the language arguments are placed after q/tbm and may be overridden by ie/oe/num
    base_args = urlencode(
        {
            'q': query,
            'tbm': "isch",
            **lang_info['params'],
            'ie': "utf8",
            'oe': "utf8",
            'num': 30,
        }
    )

    # optional filters are appended as separately encoded arguments
    url_pieces = [endpoint + "?" + base_args]
    time_range = params['time_range']
    if time_range in time_range_dict:
        url_pieces.append(urlencode({'tbs': 'qdr:' + time_range_dict[time_range]}))
    safesearch_level = params['safesearch']
    if safesearch_level:
        # level 0 is falsy: no ``safe`` argument is sent for it
        url_pieces.append(urlencode({'safe': filter_mapping[safesearch_level]}))
    params['url'] = '&'.join(url_pieces)

    headers = params['headers']
    headers.update(lang_info['headers'])
    headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
    return params


def force_https(url):
    """Return ``url`` with a leading ``http://`` replaced by ``https://``.

    The prefix test is case-sensitive; any other URL is returned unchanged.
    """
    insecure_prefix = 'http://'
    if not url.startswith(insecure_prefix):
        return url
    return 'https://' + url[len(insecure_prefix):]


def response(resp):
    """Get response from google's search request.

    The image data is read from the second ``<script>`` element that contains
    ``AF_initDataCallback({key: ``.  The call wrapper is removed (the leading
    ``AF_initDataCallback(`` and the last two characters, presumably ``);``)
    and the remaining JS object is decoded with :py:func:`js_variable_to_python`.

    Entries that do not have the expected shape are skipped.

    :param resp: HTTP response; only ``resp.text`` is read here (``resp`` is
        also handed to ``detect_google_sorry``)
    :returns: list of result dicts with the keys ``url``, ``title``, ``source``,
        ``img_format``, ``img_src``, ``thumbnail_src`` and ``template``
    :raises KeyError, IndexError, TypeError: if the decoded payload does not
        follow the hard-coded path ``data['data'][31][0][12][2]`` or an accepted
        entry lacks one of the fields read below
    """
    results = []

    detect_google_sorry(resp)

    # convert the text to dom
    dom = html.fromstring(resp.text)
    img_src_script = eval_xpath_getindex(dom, '//script[contains(., "AF_initDataCallback({key: ")]', 1).text
    data = js_variable_to_python(img_src_script.strip().replace('AF_initDataCallback(', '')[:-2])
    # data['data'][1][0][0][1] holds subqueries rather than suggestions,
    # so no 'suggestion' results are produced from it
    for img in data['data'][31][0][12][2]:
        # img[1] is read below: the entry needs at least two items
        if not isinstance(img, list) or len(img) < 2:
            continue
        fields = img[1]
        # fields[9] is read below: the list needs at least ten items
        if not isinstance(fields, list) or len(fields) < 10:
            continue
        meta = fields[9]
        if not isinstance(meta, dict) or not isinstance(meta.get("2003"), list):
            continue

        img_1_9_2003 = meta["2003"]
        img_fullsize = fields[3]
        img_thumbnail = fields[2]
        results.append(
            {
                'url': img_1_9_2003[2],
                'title': img_1_9_2003[3],
                'source': img_1_9_2003[12],
                # presumably "<width> x <height>" -- TODO confirm the order of items 1 and 2
                'img_format': str(img_fullsize[2]) + ' x ' + str(img_fullsize[1]),
                'img_src': force_https(img_fullsize[0]),
                'thumbnail_src': force_https(img_thumbnail[0]),
                'template': 'images.html',
            }
        )
    return results
	# SPDX-License-Identifier: AGPL-3.0-or-later
	# lint: pylint
	"""This is the implementation of the google images engine.

	.. admonition:: Content-Security-Policy (CSP)

	This engine needs to allow images from the `data URLs`_ (prefixed with the
	``data:`` scheme)::

	Header set Content-Security-Policy "img-src 'self' data: ;"

	.. _data URLs:
	https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs
	"""

	from urllib.parse import urlencode
	import re
	import json
	from lxml import html

	from searx.utils import eval_xpath_getindex
	from searx.engines.google import (
	get_lang_info,
	time_range_dict,
	detect_google_sorry,
	)

	# pylint: disable=unused-import
	from searx.engines.google import supported_languages_url, _fetch_supported_languages

	# pylint: enable=unused-import

	# metadata describing the upstream service this engine queries
	about = dict(
	    website='https://images.google.com',
	    wikidata_id='Q521550',
	    official_api_documentation='https://developers.google.com/custom-search',
	    use_official_api=False,
	    require_api_key=False,
	    results='HTML',
	)

	# engine dependent config
	categories = ['images', 'web']
	use_locale_domain = True
	safesearch = True
	time_range_support = True
	paging = False

	# value of the ``safe`` URL argument for each ``safesearch`` level, see request()
	filter_mapping = {
	    0: 'images',
	    1: 'active',
	    2: 'active',
	}
	# an unquoted object key: ``{ a:`` / ``, a:`` / `` a:``
	quote_keys_regex = re.compile(r'([\{\s,])(\w+)(:)')
	# a complete JS string literal, single or double quoted.  A backslash always
	# consumes the character that follows it, so ``\\`` and ``\"`` are read correctly.
	# The single capture group makes re.split() keep the literals in its result.
	_js_string_regex = re.compile(
	    r'("[^"\\]*(?:\\.[^"\\]*)*"'  # double-quoted literal
	    r"|'[^'\\]*(?:\\.[^'\\]*)*')",  # single-quoted literal
	    re.DOTALL,
	)
	# inside the body of a string literal: an escape sequence or a bare double quote
	_js_string_fixup_regex = re.compile(r'\\.|"', re.DOTALL)


	def _js_string_to_json(js_string):
	    """Convert one JS string literal (delimiters included) to a JSON string literal."""

	    def fix(match):
	        token = match.group(0)
	        if token == '"':
	            # bare double quote: only possible inside a single-quoted literal,
	            # it has to be escaped once the delimiters become double quotes
	            return r'\"'
	        if token == r"\'":
	            # JSON has no ``\'`` escape: emit the plain single quote
	            return "'"
	        # every other escape sequence is passed through unchanged
	        return token

	    return '"' + _js_string_fixup_regex.sub(fix, js_string[1:-1]) + '"'


	def js_variable_to_python(js_variable):
	    """Convert a javascript variable into JSON and then load the value.

	    Two rewrites are applied before the text is handed to :py:func:`json.loads`:

	    - string literals are re-delimited with double quotes (single-quoted JS
	      strings are not valid JSON) and their quote escapes are adjusted,
	    - unquoted object keys outside of strings are quoted
	      (``{ a: 12 }`` becomes ``{ "a": 12 }``).

	    Example: ``{ a:"f\\"irst", c:'sec"ond' }`` is loaded as
	    ``{'a': 'f"irst', 'c': 'sec"ond'}``.

	    Any other JavaScript-only syntax is left untouched and is rejected by the
	    JSON parser.

	    :param js_variable: source text of a JS literal (object, array, ...)
	    :returns: the decoded Python value
	    :raises json.JSONDecodeError: if the rewritten text is not valid JSON
	    """
	    # cut the text into alternating [code, string, code, string, ..., code]:
	    # string literals are always at the odd indexes
	    parts = _js_string_regex.split(js_variable)
	    for i, part in enumerate(parts):
	        if i % 2:
	            parts[i] = _js_string_to_json(part)
	        else:
	            # keys are only quoted outside of strings, so a colon inside a
	            # string value can never be mistaken for a key separator
	            parts[i] = quote_keys_regex.sub(r'\1"\2"\3', part)
	    # load the JSON and return the result
	    return json.loads(''.join(parts))


	def request(query, params):
	    """Build the Google Images search request.

	    Fills ``params['url']`` with the search URL and extends
	    ``params['headers']`` with the language headers plus an ``Accept`` header.

	    ``supported_languages``, ``language_aliases`` and ``logger`` are not defined
	    in the visible part of this module; presumably they are provided as module
	    globals by whatever loads the engine -- TODO confirm.

	    :param query: search terms, sent as the ``q`` URL argument
	    :param params: request parameters; ``time_range``, ``safesearch`` and
	        ``headers`` are read, ``url`` is written
	    :returns: the same ``params`` object
	    """

	    lang_info = get_lang_info(params, supported_languages, language_aliases, False)
	    logger.debug("HTTP header Accept-Language --> %s", lang_info['headers']['Accept-Language'])

	    query_url = (
	        'https://'
	        + lang_info['subdomain']
	        + '/search'
	        + "?"
	        + urlencode(
	            {
	                'q': query,
	                'tbm': "isch",
	                **lang_info['params'],
	                'ie': "utf8",
	                'oe': "utf8",
	                'num': 30,
	            }
	        )
	    )

	    # optional filters are appended as separately encoded arguments
	    if params['time_range'] in time_range_dict:
	        query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})
	    if params['safesearch']:
	        # level 0 is falsy: no ``safe`` argument is sent for it
	        query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
	    params['url'] = query_url

	    params['headers'].update(lang_info['headers'])
	    # the catch-all media range is ``*/*``; a bare ``/`` is not a valid media range
	    params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
	    return params


	def force_https(url):
	    """Return ``url`` with a leading ``http://`` replaced by ``https://``.

	    The prefix test is case-sensitive; any other URL is returned unchanged.
	    """
	    if url.startswith('http://'):
	        return 'https://' + url[7:]
	    return url


	def response(resp):
	    """Get response from google's search request.

	    The image data is read from the second ``<script>`` element that contains
	    ``AF_initDataCallback({key: ``.  The call wrapper is removed (the leading
	    ``AF_initDataCallback(`` and the last two characters, presumably ``);``)
	    and the remaining JS object is decoded with :py:func:`js_variable_to_python`.

	    Entries that do not have the expected shape are skipped.

	    :param resp: HTTP response; only ``resp.text`` is read here (``resp`` is
	        also handed to ``detect_google_sorry``)
	    :returns: list of result dicts with the keys ``url``, ``title``, ``source``,
	        ``img_format``, ``img_src``, ``thumbnail_src`` and ``template``
	    :raises KeyError, IndexError, TypeError: if the decoded payload does not
	        follow the hard-coded path ``data['data'][31][0][12][2]`` or an accepted
	        entry lacks one of the fields read below
	    """
	    results = []

	    detect_google_sorry(resp)

	    # convert the text to dom
	    dom = html.fromstring(resp.text)
	    img_src_script = eval_xpath_getindex(dom, '//script[contains(., "AF_initDataCallback({key: ")]', 1).text
	    data = js_variable_to_python(img_src_script.strip().replace('AF_initDataCallback(', '')[:-2])
	    # data['data'][1][0][0][1] holds subqueries rather than suggestions,
	    # so no 'suggestion' results are produced from it
	    for img in data['data'][31][0][12][2]:
	        # img[1] is read below: the entry needs at least two items
	        if not isinstance(img, list) or len(img) < 2:
	            continue
	        fields = img[1]
	        # fields[9] is read below: the list needs at least ten items
	        if not isinstance(fields, list) or len(fields) < 10:
	            continue
	        meta = fields[9]
	        if not isinstance(meta, dict) or not isinstance(meta.get("2003"), list):
	            continue

	        img_1_9_2003 = meta["2003"]
	        img_fullsize = fields[3]
	        img_thumbnail = fields[2]
	        results.append(
	            {
	                'url': img_1_9_2003[2],
	                'title': img_1_9_2003[3],
	                'source': img_1_9_2003[12],
	                # presumably "<width> x <height>" -- TODO confirm the order of items 1 and 2
	                'img_format': str(img_fullsize[2]) + ' x ' + str(img_fullsize[1]),
	                'img_src': force_https(img_fullsize[0]),
	                'thumbnail_src': force_https(img_thumbnail[0]),
	                'template': 'images.html',
	            }
	        )
	    return results