Last active
April 28, 2024 11:34
-
-
Save dimitryzub/9d1c5de0613610a02e3fdc96e05e86a1 to your computer and use it in GitHub Desktop.
Web scraping all Google Images in Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Step-by-step blog post: https://serpapi.com/blog/scrape-google-images-with-python/
# There's an API solution with a video tutorial: https://www.youtube.com/watch?v=QuCPV6_GT6o
import os, requests, lxml, re, json, urllib.request | |
from bs4 import BeautifulSoup | |
from serpapi import GoogleSearch | |
# HTTP headers sent with the search request: a desktop Chrome User-Agent string.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36"
}

# Query-string parameters of the search request.
# NOTE(review): "mincraft" looks like a typo for "minecraft"; the query is
# left unchanged here so the request stays exactly the same.
params = {
    "q": "mincraft wallpaper 4k",  # search query
    "tbm": "isch",                 # image results
    "hl": "en",                    # language of the search
    "gl": "us",                    # country where search comes from
    "ijn": "0"                     # page number
}

# The page is requested once, when the module is loaded; the parsing
# functions in this file read the resulting module-level `soup`.
html = requests.get(
    "https://www.google.com/search",
    params=params,
    headers=headers,
    timeout=30,
)
soup = BeautifulSoup(html.text, "lxml")
def get_images_with_request_headers():
    """Return the ``src`` value of every ``<img>`` tag on the fetched page.

    Reads the module-level ``soup`` and, as a side effect, stores
    ``"image/png"`` under ``params["content-type"]`` in the module-level
    ``params`` dict.

    Returns:
        list: ``src`` attribute values in document order. ``<img>`` tags
        that have no ``src`` attribute are skipped rather than raising
        ``KeyError``.
    """
    # NOTE(review): the search request is issued at module level before this
    # function can run, so this assignment does not influence the parsed
    # page; it is kept only because callers may observe the mutated dict.
    params["content-type"] = "image/png"

    # The attribute selector keeps only tags that actually carry `src`,
    # so the subscript below cannot fail on tags without one.
    return [img["src"] for img in soup.select("img[src]")]
def get_suggested_search_data():
    """Extract the suggested-search entries from the fetched page.

    Reads the module-level ``soup``. Thumbnail URLs are pulled with regular
    expressions from the inline ``AF_initDataCallback`` script whose key is
    ``'ds:1'``; names and links come from the ``.PKhmud.sc-it.tzVsfd``
    elements. Elements and thumbnails are paired positionally with ``zip``,
    so the result is as long as the shorter of the two sequences (minus any
    skipped element, see below).

    Returns:
        list[dict]: one dict per suggestion with the keys ``"name"``,
        ``"link"``, ``"chips"`` and ``"thumbnail"``. Elements that lack the
        ``.VlHyHc`` name node or an ``<a href>`` are skipped instead of
        raising.
    """
    all_script_tags = soup.select("script")

    # https://regex101.com/r/48UZhY/6
    matched_images = "".join(
        re.findall(
            r"AF_initDataCallback\(({key: 'ds:1'.*?)\);</script>",
            str(all_script_tags),
        )
    )

    # The payload is deliberately kept as plain text and searched with
    # regexes. It is not parsed as JSON: a json.dumps()/json.loads() round
    # trip on a str just returns the same str, so none is performed here.

    # search for only suggested search thumbnails related
    # https://regex101.com/r/ITluak/2
    suggested_search_thumbnails = ",".join(
        re.findall(r'{key(.*?)\[null,\"Size\"', matched_images)
    )

    # https://regex101.com/r/MyNLUk/1
    encoded_thumbnails = re.findall(
        r'\"(https:\/\/encrypted.*?)\"', suggested_search_thumbnails
    )

    suggested_searches = []
    for suggestion, encoded_thumbnail in zip(
        soup.select(".PKhmud.sc-it.tzVsfd"), encoded_thumbnails
    ):
        name_tag = suggestion.select_one(".VlHyHc")
        anchor = suggestion.a
        href = anchor.get("href") if anchor is not None else None
        if name_tag is None or href is None:
            # Markup differs from what the selectors expect; skip this entry
            # rather than abort the whole extraction.
            continue

        suggested_searches.append({
            "name": name_tag.text,
            "link": f"https://www.google.com{href}",
            # Value of the `chips` query parameter. `[^&]*` stops at the next
            # `&` or at the end of the string, so the value is also found
            # when `chips` is the last (or the first) parameter.
            "chips": "".join(re.findall(r"[?&]chips=([^&]*)", href)),
            # Turn literal \uXXXX sequences in the scraped text into the
            # characters they stand for.
            # https://stackoverflow.com/a/4004439/15164646 comment by Frédéric Hamidi
            "thumbnail": bytes(encoded_thumbnail, "ascii").decode("unicode-escape"),
        })

    return suggested_searches
def get_original_images():
    """Extract thumbnail and full-resolution image URLs from the fetched page.

    Reads the module-level ``soup``. URLs are pulled with regular expressions
    from the inline ``AF_initDataCallback`` scripts; title, link and source
    come from the ``.isv-r.PNCib.MSM1fd.BUooTd`` elements. The three
    sequences are paired positionally with ``zip``, so the result is as long
    as the shortest of them (minus any skipped element, see below).

    The script payload is treated as plain text: it is not parsed as JSON.
    A ``json.dumps()``/``json.loads()`` round trip on a ``str`` returns the
    same ``str``, so none is performed.

    Returns:
        list[dict]: one dict per image with the keys ``"title"``, ``"link"``,
        ``"source"``, ``"thumbnail"`` and ``"original"``. ``"title"`` and
        ``"link"`` are ``None`` when the attribute is absent. Elements that
        lack the link node or the ``.fxgdke`` source node are skipped.
    """

    def unescape_twice(text):
        # Two passes of "unicode-escape" decoding: the text comes out of
        # str(list) below, whose repr adds one extra level of backslash
        # escaping on top of the \uXXXX sequences in the page.
        once = bytes(text, "ascii").decode("unicode-escape")
        return bytes(once, "ascii").decode("unicode-escape")

    all_script_tags = soup.select("script")

    # https://regex101.com/r/48UZhY/4
    matched_images_data = "".join(
        re.findall(r"AF_initDataCallback\(([^<]+)\);", str(all_script_tags))
    )

    # https://regex101.com/r/VPz7f2/1
    matched_google_image_data = re.findall(
        r'\"b-GRID_STATE0\"(.*)sideChannel:\s?{}}', matched_images_data
    )
    # The regexes below are written against the repr of this list (note the
    # leading quote alternative in the full-resolution pattern), so the
    # str() conversion must stay.
    grid_state_text = str(matched_google_image_data)

    # https://regex101.com/r/NnRg27/1
    thumbnail_pattern = re.compile(
        r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]'
    )

    # Use the match list directly. Joining with ", " and splitting again
    # turned "no matches" into [""] and would split any URL containing ", ".
    thumbnails = [
        unescape_twice(thumbnail)
        for thumbnail in thumbnail_pattern.findall(grid_state_text)
    ]

    # removing previously matched thumbnails for easier full resolution image matches.
    without_thumbnails = thumbnail_pattern.sub("", grid_state_text)

    # https://regex101.com/r/fXjfb1/4
    # https://stackoverflow.com/a/19821774/15164646
    matched_google_full_resolution_images = re.findall(
        r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]", without_thumbnails
    )
    full_res_images = [
        unescape_twice(img) for img in matched_google_full_resolution_images
    ]

    google_images = []
    for metadata, thumbnail, original in zip(
        soup.select('.isv-r.PNCib.MSM1fd.BUooTd'), thumbnails, full_res_images
    ):
        # Look the link node up once; it supplies both title and href.
        link_tag = metadata.select_one(".VFACy.kGQAp.sMi44c.lNHeqe.WGvvNb")
        source_tag = metadata.select_one(".fxgdke")
        if link_tag is None or source_tag is None:
            # Markup differs from what the selectors expect; skip this entry
            # rather than abort the whole extraction.
            continue

        google_images.append({
            "title": link_tag.get("title"),
            "link": link_tag.get("href"),
            "source": source_tag.text,
            "thumbnail": thumbnail,
            "original": original,
        })

        # Download original images (example, disabled). `index` is not
        # defined in this loop; wrap the zip() above in enumerate() first.
        # print(f'Downloading {index} image...')
        # opener=urllib.request.build_opener()
        # opener.addheaders=[('User-Agent','Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36')]
        # urllib.request.install_opener(opener)
        # urllib.request.urlretrieve(original, f'Bs4_Images/original_size_img_{index}.jpg')

    return google_images
def serpapi_get_google_images():
    """Collect original-size image URLs for a fixed list of queries via SerpApi.

    For each query, result pages are requested one after another (``ijn`` is
    incremented after each page) until the response contains an ``"error"``
    key or no ``"images_results"``. Each new URL is printed as it is found,
    and an error message is printed when a query stops on an error.

    The API key is read from the ``API_KEY`` environment variable.

    Returns:
        list[str]: unique ``"original"`` URLs across all queries, in the
        order they were first seen. (Earlier versions built this list but
        returned ``None``.)
    """
    image_results = []
    seen = set()  # O(1) duplicate check; `image_results` keeps the order

    for query in ["Coffee", "boat", "skyrim", "minecraft"]:
        params = {
            "engine": "google",               # search engine. Google, Bing, Yahoo, Naver, Baidu...
            "q": query,                       # search query
            "tbm": "isch",                    # image results
            "num": "100",                     # number of images per page
            "ijn": 0,                         # page number: 0 -> first page, 1 -> second...
            "api_key": os.getenv("API_KEY"),  # your serpapi api key
            # other query parameters: hl (lang), gl (country), etc
        }

        search = GoogleSearch(params)  # where data extraction happens

        while True:
            results = search.get_dict()  # JSON -> Python dictionary

            # checks for "Google hasn't returned any results for this query."
            if "error" in results:
                print(results["error"])
                break

            images = results.get("images_results")
            if not images:
                # No error but also nothing to read: stop instead of raising
                # KeyError or requesting pages forever.
                break

            for image in images:
                original = image.get("original")
                if original is None or original in seen:
                    continue
                seen.add(original)
                print(original)
                image_results.append(original)

            # update to the next page
            # NOTE(review): relies on `search` using this same dict object
            # for the next request - confirm against the serpapi client.
            params["ijn"] += 1

    # -----------------------
    # Downloading images (example, disabled)

    # for index, image in enumerate(results['images_results']):
    #     print(f'Downloading {index} image...')
    #     opener=urllib.request.build_opener()
    #     opener.addheaders=[('User-Agent','Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.3538.102 Safari/537.36 Edge/18.19582')]
    #     urllib.request.install_opener(opener)
    #     urllib.request.urlretrieve(image['original'], f'SerpApi_Images/original_size_img_{index}.jpg')

    return image_results
@MatheusRodriguesEsoft I no longer use it nor maintain it 🙂
If it doesn't work for you, it's most likely because the CSS selectors or the regex no longer match.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Does this script still work for you?