Skip to content

Instantly share code, notes, and snippets.

@tomvangoethem
Created December 15, 2016 17:20
Show Gist options
  • Save tomvangoethem/9dfe2d26006ce34bad8ff492bd1fa435 to your computer and use it in GitHub Desktop.
Save tomvangoethem/9dfe2d26006ce34bad8ff492bd1fa435 to your computer and use it in GitHub Desktop.
import requests
import urllib
import urlparse
requests.packages.urllib3.disable_warnings()
# # Search for URLs using Bing's Cognitive Services API
# # Usage:
#
# > b = Bing('subscription_key')
# > b.search('keyword')
#
# # Result:
#
# ['https://adwords.google.com/KeywordPlanner', 'http://keywordtool.io/', 'https://en.wikipedia.org/wiki/Keyword', 'http://www.googlekeywordtool.com/', 'http://www.dictionary.com/browse/keyword', 'https://msdn.microsoft.com/en-us/library/x53a06bb.aspx', 'https://moz.com/beginners-guide-to-seo/keyword-research', 'http://www.keyworddiscovery.com/', 'https://moz.com/products/pro/keyword-explorer', 'https://blog.google/', 'http://www.thefreedictionary.com/keyword', 'http://keyword.com/', 'http://www.webopedia.com/TERM/K/keyword.html', 'http://www.wordstream.com/seo-keyword', 'http://www.imdb.com/search/keyword/', 'http://www.wordstream.com/keyword', 'https://support.google.com/adwords/answer/2999770?hl=en', 'http://www.webconfs.com/seo-tools/keyword-density-checker/', 'http://www.bing.com/toolbox/keywords/', 'https://knowledge.hubspot.com/keyword-user-guide-v2/understanding-keywords', 'https://www.lifewire.com/keywords-and-how-they-rank-3482895', 'https://helpx.adobe.com/lightroom/help/keywords.html', 'https://www.wordtracker.com/', 'https://blog.hubspot.com/marketing/how-to-do-keyword-research-ht', 'http://www.keyword.io/', 'https://www.techopedia.com/definition/1215/keyword-seo', 'http://www.internetmarketingninjas.com/blog/search-engine-optimization/12-free-keyword-tools-replace-googles-keyword-tool/', 'https://en.wikipedia.org/wiki/Reserved_word', 'http://searchengineland.com/threefree-keyword-tool-reviews-126217', 'http://www.royalgames.com/games/word-games/keyword/?language=en_US', 'https://msdn.microsoft.com/en-us/library/dk1507sz.aspx', 'http://keywordtool.io/youtube', 'http://webdesign.about.com/od/seo/a/keywords-html.htm', 'http://www.w3schools.com/tags/tag_meta.asp', 'https://ahrefs.com/blog/keyword-research/', 'http://www.columbia.edu/cu/lweb/help/clio/keyword.html', 'http://backlinko.com/keyword-research', 'http://contentmarketinginstitute.com/2016/04/proper-keyword-research/', 'http://www.thesaurus.com/browse/keyword', 
# 'http://tools.seobook.com/keyword-tools/seobook/', 'http://www.dictionary.com/browse/keywords', 'http://www.keywordspy.com/', 'https://www.biblegateway.com/keyword/', 'http://www.webopedia.com/TERM/K/keyword_search.html', 'http://www.yourdictionary.com/keyword', 'http://webdesign.about.com/od/seo/a/keywords-html.htm', 'http://www.keyworddiscovery.com/search.html', 'http://tools.seobook.com/keyword-list/', 'https://ubersuggest.io/', 'http://lsigraph.com/']
class Bing(object):
    """Small client for the Bing Web Search API (v5.0 endpoint).

    Typical use::

        b = Bing('subscription_key')
        urls = b.search('keyword')

    All network helpers are best-effort: they retry a few times and report
    failure by returning ``None`` instead of raising.
    """

    SEARCH_ENDPOINT = 'https://api.cognitive.microsoft.com/bing/v5.0/search'
    # Largest page size requested from the API in a single query.
    MAX_COUNT_PER_QUERY = 50
    # Paging stops once the offset reaches this value (hard failsafe).
    MAX_OFFSET = 1000
    # Seconds to wait on any single HTTP request. Without a timeout a stalled
    # connection would block forever and the retry logic would never kick in.
    REQUEST_TIMEOUT = 30

    def __init__(self, subscription_key):
        """Store the subscription key sent with every search request."""
        self.key = subscription_key

    def unwind_entry(self, entry, retries=2):
        """Replace a bing.com redirect URL in *entry* with its real target.

        Entries whose URL host does not contain ``bing.com`` are returned
        untouched. For the others a HEAD request is issued (without following
        redirects) and ``entry['url']`` is overwritten in place with the
        ``Location`` header of the response.

        :param entry: result dict holding at least a ``'url'`` key.
        :param retries: number of additional attempts after the first failure.
        :returns: the (possibly updated) *entry*, or ``None`` when the URL is
            missing/unparsable or no attempt produced an absolute redirect
            location.
        """
        try:
            url = entry['url']
            is_bing_url = 'bing.com' in urlparse.urlparse(url).netloc
        except (KeyError, TypeError, AttributeError, ValueError):
            # Malformed entry: retrying cannot help.
            return None
        if not is_bing_url:
            return entry

        for _ in range(max(retries, 0) + 1):
            try:
                response = requests.head(url, allow_redirects=False,
                                         stream=True,
                                         timeout=self.REQUEST_TIMEOUT)
                # A requests Response is falsy for 4xx/5xx status codes.
                location = response.headers.get('Location', '') if response else ''
                # Only accept absolute locations (they carry a host part).
                if urlparse.urlparse(location).netloc:
                    entry['url'] = location
                    return entry
            except Exception:
                # Any failure just counts as one failed attempt. Unlike a
                # bare ``except:`` this lets KeyboardInterrupt/SystemExit
                # propagate instead of being retried.
                continue
        return None

    def search(self, keyword, market='en-US', count=50, urls_only=True, unwind_bing_urls=True):
        """Collect up to *count* web results for *keyword*.

        Results are fetched page by page until enough entries were gathered,
        the API reports no further matches, a query fails, or the
        ``MAX_OFFSET`` failsafe is reached.

        :param keyword: search query.
        :param market: market code passed to the API as ``mkt``.
        :param count: maximum number of results to gather; ``count <= 0``
            yields an empty list without querying the API.
        :param urls_only: return plain URL strings instead of result dicts.
        :param unwind_bing_urls: resolve bing.com redirect URLs through
            :meth:`unwind_entry`; entries that cannot be resolved are dropped.
        :returns: a list of URLs (``urls_only=True``) or of result dicts.
            Entries without a URL are removed after truncation to *count*, so
            fewer than *count* items may be returned.
        """
        if count <= 0:
            return []

        page_size = min(self.MAX_COUNT_PER_QUERY, count)
        offset = 0
        entries = []
        while True:
            response = self.get_search_results(keyword, market, page_size, offset)
            web_pages = response.get('webPages') if response else None
            if not web_pages:
                # Query failed or the response carries no web results.
                break
            page = web_pages.get('value') or []
            if not page:
                # Most likely there are no more results.
                break
            entries.extend(page)
            offset += page_size
            if len(entries) >= count:
                break
            if web_pages.get('totalEstimatedMatches', 0) <= offset:
                # Everything the API claims to have has been requested.
                break
            if offset >= self.MAX_OFFSET:
                break

        # Failsafe: never hand back more than was asked for.
        entries = entries[:count]
        entries = [entry for entry in entries if entry.get('url')]
        if unwind_bing_urls:
            unwound = [self.unwind_entry(entry) for entry in entries]
            # Drop failures here so that both return shapes are free of None.
            entries = [entry for entry in unwound if entry is not None]
        if urls_only:
            return [entry['url'] for entry in entries]
        return entries

    def get_search_results(self, keyword, market='en-US', count=50, offset=0, retries=2):
        """Run one search query and return the decoded JSON response.

        :param keyword: search query (sent as ``q``).
        :param market: market code (sent as ``mkt``).
        :param count: page size requested from the API.
        :param offset: number of results to skip.
        :param retries: number of additional attempts after the first failure.
        :returns: the response dict when its ``_type`` is ``'SearchResponse'``,
            otherwise ``None`` once all attempts are used up.
        """
        params = {
            "q": keyword,
            "responseFilter": "webpages",
            "safeSearch": "Off",
            "mkt": market,
            "count": count,
            "offset": offset
        }
        headers = {'Ocp-Apim-Subscription-Key': self.key}
        for _ in range(max(retries, 0) + 1):
            try:
                # Let requests build the query string: it UTF-8 encodes
                # unicode keywords instead of failing on non-ASCII input.
                r = requests.get(Bing.SEARCH_ENDPOINT, params=params,
                                 headers=headers,
                                 timeout=self.REQUEST_TIMEOUT)
                result = r.json()
            except Exception:
                # Network or decoding problem: count it as a failed attempt
                # (KeyboardInterrupt/SystemExit still propagate).
                continue
            if isinstance(result, dict) and result.get('_type') == 'SearchResponse':
                return result
        return None
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment