Skip to content

Instantly share code, notes, and snippets.

Created December 15, 2016 17:20
Show Gist options
  • Save tomvangoethem/9dfe2d26006ce34bad8ff492bd1fa435 to your computer and use it in GitHub Desktop.
Save tomvangoethem/9dfe2d26006ce34bad8ff492bd1fa435 to your computer and use it in GitHub Desktop.
import requests
import urllib
import urlparse
# # Search for URLs using Bing's Cognitive API
# # Usage:
# > b = Bing('subscription_key')
# > b.search('keyword')
# # Result:
# ['http://example.com/', ...]  (a list of up to `count` result URLs, 50 by default)
class Bing(object):
    """Small client for the Bing Web Search API that collects result URLs.

    Usage::

        b = Bing('subscription_key')
        urls = b.search('keyword')

    NOTE(review): the values of ``SEARCH_ENDPOINT``, ``MAX_COUNT_PER_QUERY``
    and ``REDIRECT_HOST`` were not present in the reviewed source (only the
    references to them were). The values below are assumptions -- confirm
    them against the API version and subscription actually in use.
    """

    # Assumed endpoint of the web search API -- TODO confirm.
    SEARCH_ENDPOINT = 'https://api.cognitive.microsoft.com/bing/v5.0/search'
    # Assumed maximum number of results that may be requested per query
    # (matches the default ``count`` of 50) -- TODO confirm.
    MAX_COUNT_PER_QUERY = 50
    # Failsafe: never page beyond this offset.
    MAX_OFFSET = 1000
    # Result URLs on this host are treated as redirects that need resolving.
    # Presumably Bing's own click-tracking links -- TODO confirm.
    REDIRECT_HOST = 'bing.com'
    # Seconds to wait for any single HTTP request; without a timeout a
    # stalled connection would block the caller forever.
    REQUEST_TIMEOUT = 10

    def __init__(self, subscription_key):
        """Store the subscription key sent with every search request."""
        self.key = subscription_key

    def unwind_entry(self, entry, retries=2):
        """Replace a redirect URL in *entry* by the location it points to.

        Entries whose URL is not on ``REDIRECT_HOST`` are returned untouched
        and cause no network traffic. For the others a ``HEAD`` request is
        sent (without following redirects) and ``entry['url']`` is
        overwritten in place with the ``Location`` header.

        :param entry: search result dict containing a ``'url'`` key.
        :param retries: number of additional attempts after a failed one.
        :returns: the same *entry* object, or ``None`` when it has no usable
            URL or the redirect could not be resolved in any attempt.
        """
        try:
            url = entry['url']
            host = urlparse.urlparse(url).netloc
        except (AttributeError, KeyError, TypeError, ValueError):
            # Missing or unparsable URL: retrying cannot change the outcome.
            return None

        if self.REDIRECT_HOST not in host:
            return entry

        for _ in range(max(retries, 0) + 1):
            try:
                response = requests.head(url, allow_redirects=False,
                                         stream=True,
                                         timeout=self.REQUEST_TIMEOUT)
                location = response.headers.get('Location', '')
                target_host = urlparse.urlparse(location).netloc
            except (requests.RequestException, ValueError):
                continue
            # A response is falsy for error status codes; a usable redirect
            # target must be an absolute URL, i.e. have a network location.
            if response and target_host:
                entry['url'] = location
                return entry
        return None

    def search(self, keyword, market='en-US', count=50, urls_only=True, unwind_bing_urls=True):
        """Query the API page by page until *count* results are collected.

        Paging stops early when a request fails, a page comes back empty,
        the reported ``totalEstimatedMatches`` is exhausted, or the
        ``MAX_OFFSET`` failsafe is reached.

        :param keyword: search query.
        :param market: value of the ``mkt`` request parameter.
        :param count: maximum number of results to return.
        :param urls_only: return plain URL strings instead of result dicts.
        :param unwind_bing_urls: resolve redirect URLs via ``unwind_entry``.
        :returns: a list of URLs when *urls_only* is true; otherwise a list
            of result dicts, in which an entry whose redirect could not be
            resolved is represented by ``None``.
        """
        collected = []
        offset = 0
        while len(collected) < count and offset < self.MAX_OFFSET:
            # Only ask for what is still missing, capped at the page limit.
            page_size = min(self.MAX_COUNT_PER_QUERY, count - len(collected))
            response = self.get_search_results(keyword, market, page_size, offset)
            web_pages = (response or {}).get('webPages')
            if not isinstance(web_pages, dict):
                break
            page = web_pages.get('value') or []
            if not page:
                # Most likely there are no more results.
                break
            collected.extend(page)
            offset += page_size
            # We've queried for everything the API says it has.
            if web_pages.get('totalEstimatedMatches', 0) <= offset:
                break

        # Failsafe: never hand back more than requested, and drop entries
        # that carry no URL at all.
        entries = [entry for entry in collected[:max(count, 0)] if entry.get('url', False)]
        if unwind_bing_urls:
            entries = [self.unwind_entry(entry) for entry in entries]
        if urls_only:
            return [entry['url'] for entry in entries if entry is not None]
        return entries

    def get_search_results(self, keyword, market='en-US', count=50, offset=0, retries=2):
        """Fetch one page of raw search results.

        :param keyword: search query.
        :param market: value of the ``mkt`` request parameter.
        :param count: number of results to request.
        :param offset: number of results to skip.
        :param retries: number of additional attempts after a failed one.
        :returns: the decoded JSON response (a dict whose ``_type`` is
            ``'SearchResponse'``), or ``None`` if every attempt failed.
        """
        params = {
            "q": keyword,
            "responseFilter": "webpages",
            "safeSearch": "Off",
            "mkt": market,
            "count": count,
            "offset": offset,
        }
        headers = {'Ocp-Apim-Subscription-Key': self.key}
        for _ in range(max(retries, 0) + 1):
            try:
                # Let requests build the query string from ``params``.
                response = requests.get(self.SEARCH_ENDPOINT, params=params,
                                        headers=headers,
                                        timeout=self.REQUEST_TIMEOUT)
                result = response.json()
            except (requests.RequestException, ValueError):
                # Network failure or a body that is not JSON: try again.
                continue
            if isinstance(result, dict) and result.get('_type') == 'SearchResponse':
                return result
        return None
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment