Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
import requests
from bs4 import BeautifulSoup
import time
# HTTP headers passed to requests.get() in fetch_results(); the User-Agent
# string identifies the client as desktop Chrome 61 on 64-bit Windows 10.
USER_AGENT = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
def fetch_results(search_term, number_results, language_code, timeout=30):
    """Download the Google search results page for a search term.

    Args:
        search_term: Query text; must be a ``str``.
        number_results: Value sent as the ``num`` query parameter; must be
            an ``int``.
        language_code: Value sent as the ``hl`` query parameter (e.g. "en").
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(search_term, html)`` tuple, where ``html`` is the response body
        as text.

    Raises:
        AssertionError: If ``search_term`` is not a string or
            ``number_results`` is not an integer.
        requests.HTTPError: If the response has an error status code.
        requests.RequestException: For any other request failure, including
            a timeout.
    """
    # Raise AssertionError explicitly instead of using ``assert`` so the
    # checks still run under ``python -O``; scrape_google() catches this type.
    if not isinstance(search_term, str):
        raise AssertionError('Search term must be a string')
    if not isinstance(number_results, int):
        raise AssertionError('Number of results must be an integer')

    # Let requests build the query string so that characters such as '&',
    # '#', '+' or non-ASCII text in the search term are encoded correctly
    # (spaces still become '+').
    query = {'q': search_term, 'num': number_results, 'hl': language_code}
    response = requests.get(
        'https://www.google.com/search',
        params=query,
        headers=USER_AGENT,
        timeout=timeout,
    )
    response.raise_for_status()
    return search_term, response.text
def parse_results(html, keyword):
    """Extract ranked search results from a Google results page.

    Args:
        html: HTML text of the results page.
        keyword: Search term the page was fetched for; copied into every
            result dict.

    Returns:
        A list of dicts with the keys ``'keyword'``, ``'rank'``, ``'title'``
        and ``'description'``. ``rank`` starts at 1 and counts only the
        results that are kept. ``description`` is ``None`` when a result has
        no snippet element.
    """
    soup = BeautifulSoup(html, 'html.parser')
    found_results = []
    rank = 1
    for result in soup.find_all('div', attrs={'class': 'g'}):
        link = result.find('a', href=True)
        title = result.find('h3', attrs={'class': 'r'})
        # Skip blocks that lack a link or a title, and placeholder '#' links.
        if not (link and title):
            continue
        if link['href'] == '#':
            continue
        # Not every result has a snippet; find() returns None in that case,
        # so guard before calling get_text().
        description = result.find('span', attrs={'class': 'st'})
        found_results.append({
            'keyword': keyword,
            'rank': rank,
            'title': title.get_text(),
            'description': description.get_text() if description is not None else None,
        })
        rank += 1
    return found_results
def scrape_google(search_term, number_results, language_code):
    """Fetch and parse Google results for one search term.

    Args:
        search_term: Query text, passed to fetch_results().
        number_results: Number of results to request, passed to
            fetch_results().
        language_code: Language code, passed to fetch_results().

    Returns:
        The list of result dicts produced by parse_results().

    Raises:
        Exception: With a human-readable message when the arguments are
            rejected, when the server answers with an HTTP error status, or
            when the request fails for any other reason. The original
            exception is attached as ``__cause__``.
    """
    try:
        keyword, html = fetch_results(search_term, number_results, language_code)
        return parse_results(html, keyword)
    except AssertionError as err:
        raise Exception("Incorrect arguments parsed to function") from err
    # HTTPError must be handled before its base class RequestException.
    except requests.HTTPError as err:
        raise Exception("You appear to have been blocked by Google") from err
    except requests.RequestException as err:
        raise Exception("Appears to be an issue with your connection") from err
if __name__ == '__main__':
    # Demo run: scrape a few sample keywords and print everything collected.
    keywords = ['edmund martin', 'python', 'google scraping']
    data = []
    for keyword in keywords:
        try:
            data.extend(scrape_google(keyword, 100, "en"))
        except Exception as e:
            # Report the failure and carry on with the next keyword.
            print(e)
        finally:
            # Pause after every query, successful or not
            # (presumably to throttle the request rate).
            time.sleep(10)
    print(data)
@meckin

This comment has been minimized.

Show comment Hide comment
@meckin

meckin Nov 9, 2017

Have you seen an increase in blocking depending on the language requested?

meckin commented Nov 9, 2017

Have you seen an increase in blocking depending on the language requested?

@Lowell130

This comment has been minimized.

Show comment Hide comment
@Lowell130

Lowell130 Jan 7, 2018

Hello, how can I add this:
metatags = soup.find_all('meta',attrs={'name':'generator'})

Hello, how can I add this:
metatags = soup.find_all('meta',attrs={'name':'generator'})

@cabbage-dealer

This comment has been minimized.

Show comment Hide comment
@cabbage-dealer

cabbage-dealer Mar 2, 2018

When I try to use it for the keyword "Commbank", I get this error:
'NoneType' object has no attribute 'get_text'

When I try to use it for the keyword "Commbank", I get this error:
'NoneType' object has no attribute 'get_text'

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment