Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
import requests
from bs4 import BeautifulSoup
import time
# Request headers sent with every search request: a desktop Chrome
# User-Agent string is supplied instead of the HTTP client's default one.
USER_AGENT = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/61.0.3163.100 Safari/537.36'
    ),
}
def fetch_results(search_term, number_results, language_code):
    """Download the Google search results page for a query.

    Args:
        search_term: The query text; must be a ``str``.
        number_results: How many results to request (``num`` URL
            parameter); must be an ``int``.
        language_code: Interface language sent as the ``hl`` URL parameter.

    Returns:
        A ``(search_term, html)`` tuple where ``html`` is the response body
        as text.

    Raises:
        AssertionError: If ``search_term`` is not a string or
            ``number_results`` is not an integer.
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On any other request failure, including
            a timeout.
    """
    # Imported locally so the block does not depend on a new file-level import.
    from urllib.parse import urlencode

    # Raised explicitly (rather than via ``assert``) so the checks still run
    # under ``python -O``; the exception type is kept for existing callers.
    if not isinstance(search_term, str):
        raise AssertionError('Search term must be a string')
    if not isinstance(number_results, int):
        raise AssertionError('Number of results must be an integer')

    # urlencode turns spaces into '+' and also percent-escapes characters
    # such as '&', '#', '+' and non-ASCII text that would otherwise corrupt
    # the query string.
    query_string = urlencode(
        {'q': search_term, 'num': number_results, 'hl': language_code}
    )
    google_url = 'https://www.google.com/search?' + query_string

    # A timeout prevents the call from blocking forever on a stalled
    # connection; a timeout surfaces as a requests.RequestException subclass.
    response = requests.get(google_url, headers=USER_AGENT, timeout=30)
    response.raise_for_status()
    return search_term, response.text
def parse_results(html, keyword):
    """Extract ranked search results from a results page.

    Every ``<div class="g">`` in ``html`` is inspected. A block is kept only
    when it contains both an ``<a href=...>`` and an ``<h3 class="r">`` and
    its link is not ``'#'``. Kept results are numbered from 1 in document
    order; skipped blocks do not consume a rank.

    Args:
        html: Markup of the results page.
        keyword: Search term stored with every result.

    Returns:
        A list of dicts with the keys ``keyword``, ``rank``, ``title``,
        ``description`` and ``link``. ``description`` is ``None`` when the
        block has no ``<span class="st">``.
    """
    document = BeautifulSoup(html, 'html.parser')
    parsed = []
    position = 1

    for block in document.find_all('div', attrs={'class': 'g'}):
        anchor = block.find('a', href=True)
        heading = block.find('h3', attrs={'class': 'r'})
        snippet = block.find('span', attrs={'class': 'st'})

        # Blocks lacking either the link or the title are ignored.
        if not (anchor and heading):
            continue

        url = anchor['href']
        # Placeholder links are not counted as results.
        if url == '#':
            continue

        parsed.append({
            'keyword': keyword,
            'rank': position,
            'title': heading.get_text(),
            'description': snippet.get_text() if snippet else snippet,
            'link': url,
        })
        position += 1

    return parsed
def scrape_google(search_term, number_results, language_code):
    """Fetch and parse Google results for a single search term.

    Calls ``fetch_results`` and feeds the returned page to ``parse_results``.

    Args:
        search_term: The query text.
        number_results: How many results to request.
        language_code: Interface language code for the request.

    Returns:
        The list of result dicts produced by ``parse_results``.

    Raises:
        Exception: With a human-readable message when argument validation
            fails, when the server returns an HTTP error status, or when
            the request fails for another reason. The original exception
            is attached as ``__cause__``.
    """
    try:
        keyword, html = fetch_results(search_term, number_results, language_code)
        return parse_results(html, keyword)
    # Each handler chains the original error explicitly so the root cause
    # stays available to callers and in tracebacks.
    except AssertionError as err:
        raise Exception("Incorrect arguments parsed to function") from err
    # HTTPError must be handled before its parent class RequestException.
    except requests.HTTPError as err:
        raise Exception("You appear to have been blocked by Google") from err
    except requests.RequestException as err:
        raise Exception("Appears to be an issue with your connection") from err
if __name__ == '__main__':
    # Demo run: scrape a few sample queries and print everything collected.
    search_terms = ('edmund martin', 'python', 'google scraping')
    data = []
    for term in search_terms:
        try:
            data.extend(scrape_google(term, 100, "en"))
        except Exception as err:
            # A failed query is reported and the run moves on to the next term.
            print(err)
        finally:
            # Pause after every query, whether it succeeded or failed.
            time.sleep(10)
    print(data)
@meckin

This comment has been minimized.

Copy link

commented Nov 9, 2017

Have you seen an increase in blocking depending on the language requested?

@Lowell130

This comment has been minimized.

Copy link

commented Jan 7, 2018

Hello, how can I add this:
metatags = soup.find_all('meta',attrs={'name':'generator'})

@cabbage-dealer

This comment has been minimized.

Copy link

commented Mar 2, 2018

When I try to use it for the keyword "Commbank" I get this error:
'NoneType' object has no attribute 'get_text'

@rajrsingh

This comment has been minimized.

Copy link

commented Jun 8, 2018

On line 36 don't you want to include the link in the dictionary you append to found_results?

@EdmundMartin

This comment has been minimized.

Copy link
Owner Author

commented Jun 11, 2018

@rajrsingh Thanks for pointing this out. I have made the changes.

@cabbage-dealer The code on the blog was updated to fix this error. This happens when a result doesn't have a standard description. I have also updated the code to avoid this issue.

@duchonghoang

This comment has been minimized.

Copy link

commented Jun 21, 2018

I keep getting all the elements printed in reverse order (link, description, title, rank, keyword) instead of (keyword, rank, title, description, link), and sometimes they are in no particular order. I Googled this, and it's because the elements are contained in a set rather than a list.
Is there a way to print the elements in a specific order?

@dripti

This comment has been minimized.

Copy link

commented Jul 4, 2018

How can I get results for different countries?

@CodeOctal

This comment has been minimized.

Copy link

commented Oct 30, 2018

Hi, when I run this code in Spyder I get the result [ ].
What does this mean?
Can you please help me out? I am new to this domain.

@YuiyuD001

This comment has been minimized.

Copy link

commented May 25, 2019

@EdmundMartin
the result
result_block = soup.find_all('div', attrs={'class': 'g'})
returns an empty [], so what was my mistake? Can you help?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.