Created
December 15, 2016 17:20
-
-
Save tomvangoethem/9dfe2d26006ce34bad8ff492bd1fa435 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import urllib

import requests

try:
    import urlparse
except ImportError:  # Python 3 moved the module to urllib.parse
    import urllib.parse as urlparse

requests.packages.urllib3.disable_warnings()
# # Search for URLs using Bing's Cognitive API
# # Usage: | |
# | |
# > b = Bing('subscription_key') | |
# > b.search('keyword') | |
# | |
# # Result: | |
# | |
# ['https://adwords.google.com/KeywordPlanner', 'http://keywordtool.io/', 'https://en.wikipedia.org/wiki/Keyword', 'http://www.googlekeywordtool.com/', 'http://www.dictionary.com/browse/keyword', 'https://msdn.microsoft.com/en-us/library/x53a06bb.aspx', 'https://moz.com/beginners-guide-to-seo/keyword-research', 'http://www.keyworddiscovery.com/', 'https://moz.com/products/pro/keyword-explorer', 'https://blog.google/', 'http://www.thefreedictionary.com/keyword', 'http://keyword.com/', 'http://www.webopedia.com/TERM/K/keyword.html', 'http://www.wordstream.com/seo-keyword', 'http://www.imdb.com/search/keyword/', 'http://www.wordstream.com/keyword', 'https://support.google.com/adwords/answer/2999770?hl=en', 'http://www.webconfs.com/seo-tools/keyword-density-checker/', 'http://www.bing.com/toolbox/keywords/', 'https://knowledge.hubspot.com/keyword-user-guide-v2/understanding-keywords', 'https://www.lifewire.com/keywords-and-how-they-rank-3482895', 'https://helpx.adobe.com/lightroom/help/keywords.html', 'https://www.wordtracker.com/', 'https://blog.hubspot.com/marketing/how-to-do-keyword-research-ht', 'http://www.keyword.io/', 'https://www.techopedia.com/definition/1215/keyword-seo', 'http://www.internetmarketingninjas.com/blog/search-engine-optimization/12-free-keyword-tools-replace-googles-keyword-tool/', 'https://en.wikipedia.org/wiki/Reserved_word', 'http://searchengineland.com/threefree-keyword-tool-reviews-126217', 'http://www.royalgames.com/games/word-games/keyword/?language=en_US', 'https://msdn.microsoft.com/en-us/library/dk1507sz.aspx', 'http://keywordtool.io/youtube', 'http://webdesign.about.com/od/seo/a/keywords-html.htm', 'http://www.w3schools.com/tags/tag_meta.asp', 'https://ahrefs.com/blog/keyword-research/', 'http://www.columbia.edu/cu/lweb/help/clio/keyword.html', 'http://backlinko.com/keyword-research', 'http://contentmarketinginstitute.com/2016/04/proper-keyword-research/', 'http://www.thesaurus.com/browse/keyword', 
'http://tools.seobook.com/keyword-tools/seobook/', 'http://www.dictionary.com/browse/keywords', 'http://www.keywordspy.com/', 'https://www.biblegateway.com/keyword/', 'http://www.webopedia.com/TERM/K/keyword_search.html', 'http://www.yourdictionary.com/keyword', 'http://webdesign.about.com/od/seo/a/keywords-html.htm', 'http://www.keyworddiscovery.com/search.html', 'http://tools.seobook.com/keyword-list/', 'https://ubersuggest.io/', 'http://lsigraph.com/'] | |
class Bing(object):
    """Small client for the Bing Web Search endpoint in ``SEARCH_ENDPOINT``.

    Usage::

        b = Bing('subscription_key')
        urls = b.search('keyword')

    All HTTP failures are handled on a best-effort basis: requests are
    retried a few times and, if they keep failing, the affected page or
    entry is dropped instead of raising.
    """

    SEARCH_ENDPOINT = 'https://api.cognitive.microsoft.com/bing/v5.0/search'
    MAX_COUNT_PER_QUERY = 50
    # search() stops paging once the offset reaches this value.
    MAX_OFFSET = 1000
    # Seconds to wait on each HTTP call; without a timeout ``requests`` can
    # block indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 30

    def __init__(self, subscription_key):
        """Store the subscription key sent with every search request."""
        self.key = subscription_key

    @staticmethod
    def _is_bing_host(hostname):
        """Return True if *hostname* is ``bing.com`` or one of its subdomains."""
        host = (hostname or '').lower()
        return host == 'bing.com' or host.endswith('.bing.com')

    def _fetch_redirect_location(self, url):
        """Return the ``Location`` header of a HEAD request to *url*.

        Returns ``''`` when the response is an error status or carries no
        ``Location`` header. The response is always closed so the
        connection goes back to the pool.
        """
        response = requests.head(url, allow_redirects=False, stream=True,
                                 timeout=self.REQUEST_TIMEOUT)
        try:
            if not response.ok:
                return ''
            return response.headers.get('Location', '')
        finally:
            response.close()

    def unwind_entry(self, entry, retries=2):
        """Replace a Bing redirect URL in *entry* with its real target.

        Entries whose URL host is not ``bing.com`` (or a subdomain) are
        returned untouched without any network access. For Bing URLs a HEAD
        request is issued and ``entry['url']`` is overwritten in place with
        the absolute ``Location`` it redirects to.

        :param entry: result dict with a ``'url'`` key.
        :param retries: extra attempts after the first failure.
        :returns: the (possibly modified) *entry*, or ``None`` if every
            attempt failed or the entry has no usable URL.
        """
        for _ in range(max(retries, 0) + 1):
            try:
                url = entry['url']
                if not self._is_bing_host(urlparse.urlparse(url).hostname):
                    return entry
                new_location = self._fetch_redirect_location(url)
                # A relative or empty Location is not a usable target.
                if urlparse.urlparse(new_location).netloc != '':
                    entry['url'] = new_location
                    return entry
            except Exception:
                # Best effort: any failure (network, malformed entry) just
                # consumes one attempt. KeyboardInterrupt/SystemExit are not
                # Exception subclasses and still propagate.
                continue
        return None

    def search(self, keyword, market='en-US', count=50, urls_only=True, unwind_bing_urls=True):
        """Collect up to *count* web results for *keyword*.

        Pages of at most ``MAX_COUNT_PER_QUERY`` results are requested until
        enough results are gathered, a page comes back empty or invalid, the
        reported ``totalEstimatedMatches`` is exhausted, or the offset
        reaches ``MAX_OFFSET``.

        :param keyword: search query.
        :param market: value sent as the ``mkt`` parameter.
        :param count: maximum number of results to return.
        :param urls_only: return a list of URL strings instead of the raw
            result dicts.
        :param unwind_bing_urls: resolve Bing redirect URLs through
            :meth:`unwind_entry`; entries that cannot be resolved are dropped.
        :returns: a list (never a lazy iterator) of URLs or result dicts.
        """
        if count <= 0:
            return []

        page_size = min(Bing.MAX_COUNT_PER_QUERY, count)
        offset = 0
        entries = []
        while len(entries) < count and offset < Bing.MAX_OFFSET:
            page = self.get_search_results(keyword, market, page_size, offset)
            if page is None or 'webPages' not in page:
                break
            web_pages = page['webPages']
            values = web_pages.get('value', ())
            if not values:
                # Most likely there are no more results.
                break
            entries.extend(values)
            offset += page_size
            if web_pages.get('totalEstimatedMatches', 0) <= offset:
                break

        # Never hand back more than requested, and skip entries without a URL.
        entries = [entry for entry in entries[:count] if entry.get('url', False)]
        if unwind_bing_urls:
            unwound = (self.unwind_entry(entry) for entry in entries)
            entries = [entry for entry in unwound if entry is not None]
        if urls_only:
            return [entry['url'] for entry in entries]
        return entries

    def get_search_results(self, keyword, market='en-US', count=50, offset=0, retries=2):
        """Fetch one page of results from the search endpoint.

        :param keyword: search query (``q``).
        :param market: market code (``mkt``).
        :param count: number of results requested for this page.
        :param offset: number of results to skip.
        :param retries: extra attempts after the first failure.
        :returns: the decoded JSON response when its ``_type`` is
            ``'SearchResponse'``, otherwise ``None`` once all attempts fail.
        """
        params = {
            "q": keyword,
            "responseFilter": "webpages",
            "safeSearch": "Off",
            "mkt": market,
            "count": count,
            "offset": offset,
        }
        headers = {'Ocp-Apim-Subscription-Key': self.key}
        for _ in range(max(retries, 0) + 1):
            try:
                # Let requests encode the query string; it copes with
                # non-ASCII keywords on both Python 2 and 3.
                response = requests.get(Bing.SEARCH_ENDPOINT, params=params,
                                        headers=headers,
                                        timeout=self.REQUEST_TIMEOUT)
                result = response.json()
                if result.get('_type') == 'SearchResponse':
                    return result
            except Exception:
                # Best effort: network errors and non-JSON bodies count as a
                # failed attempt and are retried.
                continue
        return None
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment