Skip to content

Instantly share code, notes, and snippets.

@vivonk
Created December 18, 2017 13:16
Show Gist options
  • Save vivonk/1096bd6d23b3371eb0cbcf9356587d6f to your computer and use it in GitHub Desktop.
Save vivonk/1096bd6d23b3371eb0cbcf9356587d6f to your computer and use it in GitHub Desktop.
How to scrape Google search content using Python and scraping tools like urllib, requests and Scrapy
from __future__ import print_function
from generalized import Scraper
class Google(Scraper):
    """Scraper for Google web search result pages."""

    def __init__(self):
        # Endpoint and paging settings for this engine.
        # NOTE(review): presumably consumed by the Scraper base class when it
        # builds requests -- confirm against generalized.Scraper.
        self.url = 'https://www.google.com/search'
        self.defaultStart = 0
        self.startKey = 'start'

    def nextStart(self, currentStart, prevResults):
        """Return the start offset to use for the next page of results.

        Args:
            currentStart: offset that was used for the page just fetched.
            prevResults: results parsed from that page; only its length is
                used.

        Returns:
            ``currentStart`` advanced by the number of results in
            ``prevResults``.
        """
        return currentStart + len(prevResults)

    def parseResponse(self, soup):
        """Extract result titles and links from a parsed results page.

        Every ``<h3 class="r">`` element is inspected and the first ``<a>``
        inside it supplies the title text and the ``href``. Headings that
        contain no anchor are skipped instead of raising ``AttributeError``.

        Args:
            soup: parsed document exposing ``findAll`` (assumed to be a
                BeautifulSoup object -- confirm with the caller).

        Returns:
            list of dicts:
            ``[{'title': title1, 'link': url1}, {'title': title2, ...}, ...]``
        """
        urls = []
        for h3 in soup.findAll('h3', {'class': 'r'}):
            anchor = h3.find('a')
            if anchor is None:
                # A heading without a link carries no usable result.
                continue
            urls.append({'title': anchor.getText(),
                         'link': anchor.get('href')})
        print('Google parsed: ' + str(urls))
        return urls
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment