Bing-Image-Scraper

Bing Image Scraper Example

Example using Python to query and scrape Microsoft Bing image search.

Requirements

The BeautifulSoup and requests packages are required to run this example.

If you are using the command line and pip, you should be able to install them with:

pip install bs4
pip install requests
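
To confirm the packages installed correctly, importing them from the command line should succeed silently:

python -c "import bs4, requests"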

Running

To run, simply give the script an image search term. It will search Bing for that term and save the discovered images into the Pictures subdirectory. In the following example we look for images of kittens.

python bing_image.py kitten
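
The script reads the search term from its first command-line argument and joins multiple words with + for the query URL, so a multi-word term should be quoted:

python bing_image.py "grey kitten"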

bing_image.py

#!/usr/bin/env python3
from bs4 import BeautifulSoup
import requests
import re
import sys
import os
import http.cookiejar
import json
import urllib.request, urllib.error, urllib.parse


def get_soup(url, header):
    # Fetch the search results page and parse it with BeautifulSoup.
    return BeautifulSoup(
        urllib.request.urlopen(urllib.request.Request(url, headers=header)),
        'html.parser')


# Build the Bing image search URL from the command-line search term.
query = sys.argv[1]
query = '+'.join(query.split())
url = "http://www.bing.com/images/search?q=" + query + "&FORM=HDRSC2"

# Add the directory for your images here.
DIR = "Pictures"
header = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"}
soup = get_soup(url, header)

# Each result anchor carries JSON metadata: "turl" is the thumbnail URL,
# "murl" the URL of the large original image.
ActualImages = []
for a in soup.find_all("a", {"class": "iusc"}):
    mad = json.loads(a["mad"])
    turl = mad["turl"]
    m = json.loads(a["m"])
    murl = m["murl"]
    image_name = urllib.parse.urlsplit(murl).path.split("/")[-1]
    print(image_name)
    ActualImages.append((image_name, turl, murl))

print("there are", len(ActualImages), "images in total")

# Save the downloads under Pictures/<query>.
if not os.path.exists(DIR):
    os.mkdir(DIR)
DIR = os.path.join(DIR, query.split()[0])
if not os.path.exists(DIR):
    os.mkdir(DIR)

for i, (image_name, turl, murl) in enumerate(ActualImages):
    try:
        # Download the thumbnail; use murl here instead to fetch the original image.
        raw_img = urllib.request.urlopen(turl).read()
        # Count files already saved with this name so duplicates get a distinct name.
        cntr = len([f for f in os.listdir(DIR) if image_name in f]) + 1
        if cntr > 1:
            image_name = str(cntr) + "_" + image_name
        with open(os.path.join(DIR, image_name), 'wb') as f:
            f.write(raw_img)
    except Exception as e:
        print("could not load : " + image_name)
        print(e)
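
The requirements above list requests, but the script itself downloads with urllib and fetches the thumbnail URL. As a minimal sketch, assuming you would rather fetch the full-size originals (murl) with requests, a hypothetical helper along these lines could replace the body of the try block:

import os
import requests

def download_image(url, directory, filename, user_agent):
    # Fetch one image URL, reusing the same User-Agent sent for the search page,
    # and write the response bytes to directory/filename.
    try:
        response = requests.get(url, headers={'User-Agent': user_agent}, timeout=10)
        response.raise_for_status()
        with open(os.path.join(directory, filename), 'wb') as f:
            f.write(response.content)
        return True
    except (requests.RequestException, OSError) as e:
        print("could not load : " + filename)
        print(e)
        return False

# e.g. download_image(murl, DIR, image_name, header['User-Agent'])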