What will be scraped
Full Code
```python
import os, requests, lxml, re, json, urllib.request
from bs4 import BeautifulSoup
from serpapi import GoogleSearch

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36"
}

params = {
    "q": "minecraft wallpaper 4k",  # search query
    "tbm": "isch",                  # image results
    "hl": "en",                     # language of the search
    "gl": "us",                     # country where search comes from
    "ijn": "0"                      # page number
}

html = requests.get("https://www.google.com/search", params=params, headers=headers, timeout=30)
soup = BeautifulSoup(html.text, "lxml")


def get_images_with_request_headers():
    del params["ijn"]
    params["content-type"] = "image/png"  # parameter that indicates the original media type

    return [img["src"] for img in soup.select("img")]


def get_suggested_search_data():
    suggested_searches = []

    all_script_tags = soup.select("script")

    # https://regex101.com/r/48UZhY/6
    matched_images = "".join(re.findall(r"AF_initDataCallback\(({key: 'ds:1'.*?)\);</script>", str(all_script_tags)))

    # https://kodlogs.com/34776/json-decoder-jsondecodeerror-expecting-property-name-enclosed-in-double-quotes
    # if you try to json.loads() without json.dumps() it will throw an error:
    # "Expecting property name enclosed in double quotes"
    matched_images_data_fix = json.dumps(matched_images)
    matched_images_data_json = json.loads(matched_images_data_fix)

    # search for suggested search thumbnails only
    # https://regex101.com/r/ITluak/2
    suggested_search_thumbnails = ",".join(re.findall(r'{key(.*?)\[null,\"Size\"', matched_images_data_json))

    # https://regex101.com/r/MyNLUk/1
    suggested_search_thumbnail_encoded = re.findall(r'\"(https:\/\/encrypted.*?)\"', suggested_search_thumbnails)

    for suggested_search, suggested_search_fixed_thumbnail in zip(soup.select(".PKhmud.sc-it.tzVsfd"),
                                                                  suggested_search_thumbnail_encoded):
        suggested_searches.append({
            "name": suggested_search.select_one(".VlHyHc").text,
            "link": f"https://www.google.com{suggested_search.a['href']}",
            # https://regex101.com/r/y51ZoC/1
            "chips": "".join(re.findall(r"&chips=(.*?)&", suggested_search.a["href"])),
            # https://stackoverflow.com/a/4004439/15164646 comment by Frédéric Hamidi
            "thumbnail": bytes(suggested_search_fixed_thumbnail, "ascii").decode("unicode-escape")
        })

    return suggested_searches


def get_original_images():
    """
    https://kodlogs.com/34776/json-decoder-jsondecodeerror-expecting-property-name-enclosed-in-double-quotes
    if you try to json.loads() without json.dumps() it will throw an error:
    "Expecting property name enclosed in double quotes"
    """

    google_images = []

    all_script_tags = soup.select("script")

    # https://regex101.com/r/48UZhY/4
    matched_images_data = "".join(re.findall(r"AF_initDataCallback\(([^<]+)\);", str(all_script_tags)))

    matched_images_data_fix = json.dumps(matched_images_data)
    matched_images_data_json = json.loads(matched_images_data_fix)

    # https://regex101.com/r/VPz7f2/1
    matched_google_image_data = re.findall(r'\"b-GRID_STATE0\"(.*)sideChannel:\s?{}}', matched_images_data_json)

    # https://regex101.com/r/NnRg27/1
    matched_google_images_thumbnails = ", ".join(
        re.findall(r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]',
                   str(matched_google_image_data))).split(", ")

    thumbnails = [
        bytes(bytes(thumbnail, "ascii").decode("unicode-escape"), "ascii").decode("unicode-escape")
        for thumbnail in matched_google_images_thumbnails
    ]

    # removing previously matched thumbnails for easier full resolution image matches
    removed_matched_google_images_thumbnails = re.sub(
        r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]', "",
        str(matched_google_image_data))

    # https://regex101.com/r/fXjfb1/4
    # https://stackoverflow.com/a/19821774/15164646
    matched_google_full_resolution_images = re.findall(r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]",
                                                       removed_matched_google_images_thumbnails)

    full_res_images = [
        bytes(bytes(img, "ascii").decode("unicode-escape"), "ascii").decode("unicode-escape")
        for img in matched_google_full_resolution_images
    ]

    for index, (metadata, thumbnail, original) in enumerate(zip(soup.select(".isv-r.PNCib.MSM1fd.BUooTd"),
                                                                thumbnails, full_res_images), start=1):
        google_images.append({
            "title": metadata.select_one(".VFACy.kGQAp.sMi44c.lNHeqe.WGvvNb")["title"],
            "link": metadata.select_one(".VFACy.kGQAp.sMi44c.lNHeqe.WGvvNb")["href"],
            "source": metadata.select_one(".fxgdke").text,
            "thumbnail": thumbnail,
            "original": original
        })

        # Download original images
        print(f"Downloading {index} image...")

        opener = urllib.request.build_opener()
        opener.addheaders = [("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36")]
        urllib.request.install_opener(opener)

        urllib.request.urlretrieve(original, f"Bs4_Images/original_size_img_{index}.jpg")

    return google_images
```
Prerequisites
Install libraries:
```bash
pip install requests beautifulsoup4 lxml google-search-results
```
`google-search-results` is SerpApi's Python API package; it will be shown at the end as an alternative solution.
Basic knowledge of scraping with CSS selectors

CSS selectors declare which part of the markup a style applies to, thus allowing you to extract data from matching tags and attributes.

If you haven't scraped with CSS selectors before, there's a dedicated blog post of mine about how to use CSS selectors when web scraping that covers what they are, their pros and cons, and why they matter from a web-scraping perspective.
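To make it concrete, here's a minimal sketch of `select()` and `select_one()` on a made-up HTML snippet (the class names here are invented for illustration):

```python
from bs4 import BeautifulSoup

# hypothetical markup, just to demonstrate CSS selectors
html_snippet = """
<div class="result">
    <a class="title" href="https://example.com">Example title</a>
    <span class="source">example.com</span>
</div>
"""

soup = BeautifulSoup(html_snippet, "lxml")

# select() returns a list of all matching elements, select_one() only the first match
for result in soup.select(".result"):
    print(result.select_one(".title").text)     # Example title
    print(result.select_one(".title")["href"])  # https://example.com
    print(result.select_one(".source").text)    # example.com
```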
Reduce the chance of being blocked
There's a chance that a request might be blocked. Have a look at how to reduce the chance of being blocked while web scraping; there are eleven methods to bypass blocks from most websites.

Make sure to pass a `User-Agent`, because Google might eventually block your requests, and you'll receive different HTML and thus empty output.

The `User-Agent` header identifies the browser, its version number, and its host operating system, which lets servers and network peers tell whether a request comes from a real browser or from a bot. By passing a real browser's `User-Agent` we're faking a "real" user visit. Check what your user-agent is.
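As a quick illustrative check (not part of the scraper itself), httpbin.org echoes request headers back, so you can see exactly what `requests` sends by default versus with a custom `User-Agent`:

```python
import requests

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36"
}

# default user-agent is "python-requests/x.y.z", which is easy to flag as a script
print(requests.get("https://httpbin.org/headers").json()["headers"]["User-Agent"])

# with a custom header the request looks like a regular browser visit
print(requests.get("https://httpbin.org/headers", headers=headers).json()["headers"]["User-Agent"])
```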
Code Explanation
Import libraries:
```python
import os, requests, lxml, re, json, urllib.request
from bs4 import BeautifulSoup
from serpapi import GoogleSearch
```
| Library | Purpose |
|---|---|
| `os` | to return the environment variable (SerpApi API key) value. |
| `requests` | to make a request to the website. |
| `lxml` | to process XML/HTML documents fast. |
| `json` | to convert extracted data to a JSON object. |
| `re` | to extract parts of the data via regular expressions. |
| `urllib.request` | to save images locally with `urllib.request.urlretrieve`. |
| `BeautifulSoup` | an XML/HTML scraping library. It's used in combination with `lxml`, as it's faster than `html.parser`. |
Create URL parameters and request headers:
```python
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36"
}

params = {
    "q": "minecraft wallpaper 4k",  # search query
    "tbm": "isch",                  # image results
    "hl": "en",                     # language of the search
    "gl": "us",                     # country where search comes from
    "ijn": "0"                      # page number
}
```
| Code | Explanation |
|---|---|
| `params` | a prettier way of passing URL parameters to a request. |
| `user-agent` | to act as a "real" user request from the browser by passing it to request headers. The default `requests` user-agent is `python-requests`, so websites might understand that it's a bot or a script and block the request. Check what your user-agent is. |
Make a request, passing the created request parameters and headers, and pass the returned HTML to `BeautifulSoup`:
```python
html = requests.get("https://www.google.com/search", params=params, headers=headers, timeout=30)
soup = BeautifulSoup(html.text, "lxml")
```
| Code | Explanation |
|---|---|
| `timeout=30` | to stop waiting for a response after 30 seconds. |
| `BeautifulSoup(html.text, "lxml")` | `html.text` returns the HTML as text, and `"lxml"` sets the XML/HTML processor instead of the default `html.parser`. |
Extracting data with request headers only, no regular expressions for the moment:
```python
def get_images_with_request_headers():
    params["content-type"] = "image/png"  # parameter that indicates the original media type

    return [img["src"] for img in soup.select("img")]
```
This is handy because if you try to directly parse data from the `img` tag and `src` attribute, you'll get a base64-encoded URL, which is a 1x1 image placeholder. Not a particularly useful image resolution 🙂 (A short sketch for filtering those placeholders out follows the table below.)
| Code | Explanation |
|---|---|
| `params["content-type"]` | creates a new dict key `"content-type"` and assigns it an `"image/png"` value, which will return images. |
| `[img["src"] for img in soup.select("img")]` | iterates over all `img` tags and extracts the `src` attribute in a list comprehension; the returned value is a list of URLs from the `src` attributes. |
Print returned data:
[ "/images/branding/searchlogo/1x/googlelogo_desk_heirloom_color_150x55dp.gif", "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQIMPdHAO0x25OT-1uxKUw_Kh1Bct6RIDaHlL8fTqXY_qGNdAizUZGa_uI6_Q&s", "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSvj_A0yXqbEgcTsZ1_ckFjtTEVCxn9BFJF6EYh1MbLiWokT3EdvPo0aB3aeQ&s", "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcThOaMoYjrJCat0kGBaecZVz1pOXsntuvwyexmedIsR4gFXtek-3rNmBlL8fQ&s", "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTuUcXHuHe2qOVv_IfHmk3A8T24VeWT-qHfRrHaEUHH89MGlH8r2NJXHHZ-Yg&s", "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSan-UQMuDyVQI5ZEiyubOle_0q_PgXL7DpBgKq8Y2-Fuc1BM1X5MxMBiP4ag&s", "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQfWQp1ltZK-o_PZ1f2rRtSq0MoWx-0jLFh_y1uv8umjK4eSj6IqhqRBzCKtg&s", "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRn4d2zWI7_4wInrzsNntRSWcA_neicVq63ZJdiiYcsEERogyw542JmFugEc4U&s", "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTDtyIKB9SH1rg9U4kcrGlD_La_NCcveaOD4UX1EaXUxkW-L0DNhcsLX5obpQ&s", "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRg2xAc8_Wdv5Cheb9w6Xq_e7X1fV7tdUCi9F-PsbHGlaJ2dLHSfYUgPXHmIw&s", "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSmdAeFWxFVq7gpGtkEqq_ZzexHslYOGrHxW-IUgeOXv4CuIygDDyhaqSnPSy8&s", "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQHeNDVL9JbWgD8ypdDR4XTr6xY7dRgh_n2_5q8GZQuiqlk_oDCHaOC0bu0Hg&s", "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRa5NpjrrZl0US_8TFHdO5d3SQ4TF_dn6MvRzL4SwzB_KcemJWdPRHJtGnTOw&s", "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSbRM8yNQyOF4EZsYZCtu7W4mWOIVPfaGySwmyfA6eeJfMySK4clUTvdrMg-g&s", "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRvBJ8a7VeA8JLlqI_p4oP2fx7oXzYMVCQEPG_Pg_ez0DvAwk7-aCwM4_U7vAA&s", "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSHNhOWeEKm5LbrrpPBzbfve0V9YFLTYMhxwIDtxizdLQksWhsqIpoQ1UsmaA&s", "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRKeLIuwzSKazmbf4pUxXxosJf1yfacVk0YAmghMI9tvU1UgVeRKp1RYgxqcEY&s", "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQ7dBADGlYdi8qGZ9EiAEqFp0V0CQlvMTbczsJShx0qwNV0aM-BKAR33bsTUTg&s", "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSIjBkxu1ZJH5Nl_Gnr1U24VDDXPJ8HmjQ4GltNgIr0An3sHmzKUYafNp03qA&s", "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcS0JKR9RIPl5boovVxH9oSNPJzkIyQB1RhdoJSHyi7ScUVPM2T6I3N0r1awxQ&s", "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRmUMz6FrjQ8YF9Ktxz1ftVKAnNxmTs5ACqDpJ0z_4ppcYBaWWGCyfd-GZLjQ&s" ]
Now on to the suggested search results, the strip that appears above the actual images:
```python
def get_suggested_search_data():
    """
    https://kodlogs.com/34776/json-decoder-jsondecodeerror-expecting-property-name-enclosed-in-double-quotes
    if you try to json.loads() without json.dumps() it will throw an error:
    "Expecting property name enclosed in double quotes"
    """

    suggested_searches = []

    all_script_tags = soup.select("script")

    # https://regex101.com/r/48UZhY/6
    matched_images = "".join(re.findall(r"AF_initDataCallback\(({key: 'ds:1'.*?)\);</script>", str(all_script_tags)))

    matched_images_data_fix = json.dumps(matched_images)
    matched_images_data_json = json.loads(matched_images_data_fix)

    # search for suggested search thumbnails only
    # https://regex101.com/r/ITluak/2
    suggested_search_thumbnails = ",".join(re.findall(r'{key(.*?)\[null,\"Size\"', matched_images_data_json))

    # https://regex101.com/r/MyNLUk/1
    suggested_search_thumbnail_encoded = re.findall(r'\"(https:\/\/encrypted.*?)\"', suggested_search_thumbnails)

    # zip() is used on purpose over zip_longest() as the number of results is identical
    for suggested_search, suggested_search_fixed_thumbnail in zip(soup.select(".PKhmud.sc-it.tzVsfd"),
                                                                  suggested_search_thumbnail_encoded):
        suggested_searches.append({
            "name": suggested_search.select_one(".VlHyHc").text,
            "link": f"https://www.google.com{suggested_search.a['href']}",
            # https://regex101.com/r/y51ZoC/1
            "chips": "".join(re.findall(r"&chips=(.*?)&", suggested_search.a["href"])),
            # https://stackoverflow.com/a/4004439/15164646 comment by Frédéric Hamidi
            "thumbnail": bytes(suggested_search_fixed_thumbnail, "ascii").decode("unicode-escape")
        })

    return suggested_searches
```
| Code | Explanation |
|---|---|
| `suggested_searches` | a temporary list where extracted data will be appended at the end of the function. |
| `all_script_tags` | a variable that holds all extracted `<script>` HTML tags from `soup.select("script")`, where `select()` returns a list of matched `<script>` tags. |
| `matched_images` | holds all matched image data extracted by `re.findall()`, which returns a list of matches. This variable is needed to extract suggested search thumbnails, image thumbnails, and full-resolution images. |
| `suggested_search_thumbnails` and `suggested_search_thumbnail_encoded` | parse part of the inline JSON, where `suggested_search_thumbnail_encoded` parses the actual thumbnails from the partly parsed inline JSON data. |
| `zip()` | to iterate over multiple iterables in parallel. Keep in mind that `zip()` is used on purpose: it stops at the shortest iterator, while `zip_longest()` iterates up to the length of the longest one. |
| `suggested_searches.append({})` | to append extracted image data to the list as a dictionary. |
| `select_one()` | to return one (instead of all) matched element in a loop. |
| `["href"]` | a BeautifulSoup shortcut for accessing and extracting HTML attributes. The alternative is `get(<attribute>)`. |
| `"".join()` | to join all items from an iterable into a string. |
| `bytes(<variable>, "ascii").decode("unicode-escape")` | to decode the parsed, escaped image data. |
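The decode trick looks odd at first, so here's a minimal sketch of what it does. The URL below is a made-up example; in the raw `<script>` text the escape can itself be escaped, which is why the next section decodes twice:

```python
# first pass: "\\u003d" -> "\u003d"; second pass: "\u003d" -> "="
double_escaped = "https://encrypted-tbn0.gstatic.com/images?q\\\\u003dtbn:EXAMPLE"

once = bytes(double_escaped, "ascii").decode("unicode-escape")
twice = bytes(once, "ascii").decode("unicode-escape")

print(twice)  # https://encrypted-tbn0.gstatic.com/images?q=tbn:EXAMPLE
```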
Printed returned data:
[ { "name": "ultra hd", "link": "https://www.google.com/search?q=minecraft+wallpaper+4k&tbm=isch&hl=en&gl=us&chips=q:minecraft+wallpaper+4k,g_1:ultra+hd:5VuluDYWa8Y%3D&sa=X&ved=2ahUKEwjshdCK0Yn5AhXrlWoFHYhyCrQQ4lYoAHoECAEQHQ", "chips": "q:minecraft+wallpaper+4k,g_1:ultra+hd:5VuluDYWa8Y%3D", "thumbnail": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcThU0xo_GeIciyaBmvE6EI46tnj0npeDAmDsLKjYlnv4tGz0eaw&usqp=CAU" }, { "name": "epic", "link": "https://www.google.com/search?q=minecraft+wallpaper+4k&tbm=isch&hl=en&gl=us&chips=q:minecraft+wallpaper+4k,g_1:epic:5c56RYLjq2c%3D&sa=X&ved=2ahUKEwjshdCK0Yn5AhXrlWoFHYhyCrQQ4lYoAXoECAEQHw", "chips": "q:minecraft+wallpaper+4k,g_1:epic:5c56RYLjq2c%3D", "thumbnail": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQ_bUq-7tk9FyeNSW40Yo8FRY6SOViMbUeme_ln1uMwxcTdfI6d&usqp=CAU" }, ... other results ]
Extracting original resolution images:
```python
def get_original_images():
    """
    https://kodlogs.com/34776/json-decoder-jsondecodeerror-expecting-property-name-enclosed-in-double-quotes
    if you try to json.loads() without json.dumps() it will throw an error:
    "Expecting property name enclosed in double quotes"
    """

    google_images = []

    all_script_tags = soup.select("script")

    # https://regex101.com/r/48UZhY/4
    matched_images_data = "".join(re.findall(r"AF_initDataCallback\(([^<]+)\);", str(all_script_tags)))

    matched_images_data_fix = json.dumps(matched_images_data)
    matched_images_data_json = json.loads(matched_images_data_fix)

    # https://regex101.com/r/VPz7f2/1
    matched_google_image_data = re.findall(r'\"b-GRID_STATE0\"(.*)sideChannel:\s?{}}', matched_images_data_json)

    # https://regex101.com/r/NnRg27/1
    matched_google_images_thumbnails = ", ".join(
        re.findall(r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]',
                   str(matched_google_image_data))).split(", ")

    thumbnails = [
        bytes(bytes(thumbnail, "ascii").decode("unicode-escape"), "ascii").decode("unicode-escape")
        for thumbnail in matched_google_images_thumbnails
    ]

    # removing previously matched thumbnails for easier full resolution image matches
    removed_matched_google_images_thumbnails = re.sub(
        r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]', "",
        str(matched_google_image_data))

    # https://regex101.com/r/fXjfb1/4
    # https://stackoverflow.com/a/19821774/15164646
    matched_google_full_resolution_images = re.findall(r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]",
                                                       removed_matched_google_images_thumbnails)

    full_res_images = [
        bytes(bytes(img, "ascii").decode("unicode-escape"), "ascii").decode("unicode-escape")
        for img in matched_google_full_resolution_images
    ]

    for index, (metadata, thumbnail, original) in enumerate(zip(soup.select(".isv-r.PNCib.MSM1fd.BUooTd"),
                                                                thumbnails, full_res_images), start=1):
        google_images.append({
            "title": metadata.select_one(".VFACy.kGQAp.sMi44c.lNHeqe.WGvvNb")["title"],
            "link": metadata.select_one(".VFACy.kGQAp.sMi44c.lNHeqe.WGvvNb")["href"],
            "source": metadata.select_one(".fxgdke").text,
            "thumbnail": thumbnail,
            "original": original
        })

        # Download original images
        print(f"Downloading {index} image...")

        opener = urllib.request.build_opener()
        opener.addheaders = [("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36")]
        urllib.request.install_opener(opener)

        urllib.request.urlretrieve(original, f"Bs4_Images/original_size_img_{index}.jpg")

    return google_images
```
The process is almost identical to extracting suggested search results, except for different regular expressions:

1. Create a temporary list `google_images` where the extracted data will be appended.
2. Extract `all_script_tags`.
3. Extract `matched_images_data` to pull out the thumbnails and original resolution images.
4. Decode the extracted, encoded `thumbnails`:

```python
thumbnails = [
    bytes(bytes(thumbnail, "ascii").decode("unicode-escape"), "ascii").decode("unicode-escape")
    for thumbnail in matched_google_images_thumbnails
]

# equivalent to
for fixed_google_image_thumbnail in matched_google_images_thumbnails:
    # https://stackoverflow.com/a/4004439/15164646 comment by Frédéric Hamidi
    google_image_thumbnail_not_fixed = bytes(fixed_google_image_thumbnail, "ascii").decode("unicode-escape")

    # after the first decoding, Unicode escapes are still present; the second pass decodes them
    google_image_thumbnail = bytes(google_image_thumbnail_not_fixed, "ascii").decode("unicode-escape")
```

5. Decode the extracted, encoded `full_res_images`:

```python
full_res_images = [
    bytes(bytes(img, "ascii").decode("unicode-escape"), "ascii").decode("unicode-escape")
    for img in matched_google_full_resolution_images
]

# equivalent to
for index, fixed_full_res_image in enumerate(matched_google_full_resolution_images):
    # https://stackoverflow.com/a/4004439/15164646 comment by Frédéric Hamidi
    original_size_img_not_fixed = bytes(fixed_full_res_image, "ascii").decode("unicode-escape")
    original_size_img = bytes(original_size_img_not_fixed, "ascii").decode("unicode-escape")
```
Save full resolution images locally:
```python
opener = urllib.request.build_opener()
opener.addheaders = [("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36")]
urllib.request.install_opener(opener)

urllib.request.urlretrieve(original, f"Bs4_Images/original_size_img_{index}.jpg")
```
| Code | Explanation |
|---|---|
| `urllib.request.build_opener()` | manages the chaining of handlers and will automatically add headers on each request (row below). |
| `opener.addheaders = [()]` | to add headers to the request. |
| `urllib.request.install_opener()` | installs the opener as the default global opener, so every subsequent `urllib.request` call (including `urlretrieve()`) uses it and its headers. |
| `urllib.request.urlretrieve()` | to save images locally. |
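If you'd rather avoid installing a global opener, a `requests`-based download achieves the same thing with headers scoped to a single call. A minimal sketch using `original`, `index`, and `headers` from the loop above (the directory creation is an added convenience):

```python
import os
import requests

os.makedirs("Bs4_Images", exist_ok=True)  # make sure the target folder exists

# same download, but the User-Agent only applies to this one request
image = requests.get(original, headers=headers, timeout=30)

with open(f"Bs4_Images/original_size_img_{index}.jpg", "wb") as f:
    f.write(image.content)
```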
Printed returned data:
[ { "title": "4K Minecraft Wallpapers | Background Images", "link": "https://wall.alphacoders.com/tag/4k-minecraft-wallpapers", "source": "wall.alphacoders.com", "thumbnail": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSJxrGh1FUsvCRNgKI4aiM8CimALQ0rHU2SDigSRl6X1c7BiWDOUMMMVCwyKtufB9SEddw&usqp=CAU", "original": "https://images6.alphacoders.com/108/thumb-1920-1082090.jpg" }, { "title": "Best Minecraft Wallpaper 4k - Minecraft Tutos", "link": "https://minecraft-tutos.com/en/minecraft-wallpaper/", "source": "minecraft-tutos.com", "thumbnail": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTRDMguXava6khO5e5A0GQsm5v64rrJI_tYuSaJjyxWQNhTrhRWPRLLuhtPVouOUSaqzC0&usqp=CAU", "original": "https://minecraft-tutos.com/wp-content/uploads/2022/03/wallpaper-minecraft-alex-steve-universe.jpeg" }, ... other results ]
Using Google Images API
The main difference is that it's a quicker approach: there's no need to figure out regular expressions, create a parser and maintain it over time, or work out how to scale the number of requests without being blocked.
Example with pagination and multiple search queries:
```python
def serpapi_get_google_images():
    image_results = []

    for query in ["Coffee", "boat", "skyrim", "minecraft"]:
        # search query parameters
        params = {
            "engine": "google",              # search engine. Google, Bing, Yahoo, Naver, Baidu...
            "q": query,                      # search query
            "tbm": "isch",                   # image results
            "num": "100",                    # number of images per page
            "ijn": 0,                        # page number: 0 -> first page, 1 -> second...
            "api_key": os.getenv("API_KEY")  # your serpapi api key
            # other query parameters: hl (lang), gl (country), etc
        }

        search = GoogleSearch(params)  # where data extraction happens

        images_is_present = True
        while images_is_present:
            results = search.get_dict()  # JSON -> Python dictionary

            # checks for "Google hasn't returned any results for this query."
            if "error" not in results:
                for image in results["images_results"]:
                    if image["original"] not in image_results:
                        image_results.append(image["original"])

                # update to the next page
                params["ijn"] += 1
            else:
                images_is_present = False
                print(results["error"])

    # -----------------------
    # Downloading images

    for index, image in enumerate(results["images_results"], start=1):
        print(f"Downloading {index} image...")

        opener = urllib.request.build_opener()
        opener.addheaders = [("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36")]
        urllib.request.install_opener(opener)

        urllib.request.urlretrieve(image["original"], f"SerpApi_Images/original_size_img_{index}.jpg")

    print(json.dumps(image_results, indent=2))
    print(len(image_results))
```
Outputs:
[ "https://i.ytimg.com/vi/ZXMgC-HuvIk/maxresdefault.jpg", "https://www.minecraft.net/content/dam/games/minecraft/key-art/redeem-art-minecraft-285x380.jpg", "https://i.ytimg.com/vi/yZ_Ppfg886A/maxresdefault.jpg", "https://www.minecraft.net/content/dam/games/minecraft/screenshots/1-18-2-release-header.jpg", "https://i.ytimg.com/vi/vdrn4ouZRvQ/maxresdefault.jpg", "https://cdn.shopify.com/s/files/1/0266/4841/2351/products/MCMATTEL-PLUSH-BUNDLE-Minecraft-PlushImage-1080x1080_1_1024x1024.jpg?v=1647522411", "https://i.ytimg.com/vi/LMCt-gSvEqU/maxresdefault.jpg", "https://www.minecraft.net/content/dam/community/events/cy21/minecraft-live-2021/Minecraft_Live_2021_PMP_Hero_01.jpg", "https://i.ytimg.com/vi/yCNUP2NAt-A/maxresdefault.jpg", "https://yt3.ggpht.com/WSL98T4k4vjvwzFjtIk_tQfTGu7ak0mTRiUnF1djjhevjEX4SNW9LOiY5534JKOzSYlehght0w=s540-w390-h540-c-k-c0x00ffffff-no-nd-rj", "https://i.ytimg.com/vi/rrl8-jOOlIA/maxresdefault.jpg", "https://i.ytimg.com/vi/f8LJloSamwg/maxresdefault.jpg", "https://i.ytimg.com/vi/IqtMhWqv_pw/maxresdefault.jpg", "https://i.ytimg.com/vi/076mjMOL6R8/maxresdefault.jpg", "https://i.ytimg.com/vi/5qrUb7a821c/maxresdefault.jpg", "https://i.ytimg.com/vi/HZGffLRh6a4/maxresdefault.jpg", "https://i.ytimg.com/vi/nFQKvjM9HCw/maxresdefault.jpg", "https://i.ytimg.com/vi/LK4w3PwdCWc/maxresdefault.jpg", "https://i.ytimg.com/vi/hySgv7XyWaM/maxresdefault.jpg", "https://i.ytimg.com/vi/rmkGOy7pS4I/maxresdefault.jpg", "https://i.ytimg.com/vi/YV-576jC1BU/maxresdefault.jpg", "https://i.ytimg.com/vi/YXY74kWderc/maxresdefault.jpg" ] 2349 # number of total extracted images
Links
- Code in the online IDE
- Google Images API
- Github Gist
- Video API tutorial: Web Scraping all Google Images in Python and SerpApi
Add a Feature Request💫 or a Bug🐞