Skip to content

Instantly share code, notes, and snippets.

@ayushgoel
Created June 1, 2015 15:31
Show Gist options
  • Save ayushgoel/075502787a85ca7e4e65 to your computer and use it in GitHub Desktop.
Save ayushgoel/075502787a85ca7e4e65 to your computer and use it in GitHub Desktop.
The closure of Google Code led to this script being moved here. It provides an iterator over Google search results.
#!/usr/bin/env python
## DESCRIPTION
## The script provides an iterator over the search results returned by google, after giving a query.
##
## EXAMPLES
## TODO: Show some examples of how to use this script.
##
## EXIT STATUS
## No exit statuses provided. The class simply returns an iterator over the results. Only error raised is the StopIteration.
##
## AUTHOR
## Ayush Goel <ayushgoel111@gmail.com>
##
## LICENSE
## GPL v3 used
## uploaded on code http://code.google.com/p/google-search-iter/
## This script is in the public domain, free from copyrights or restrictions.
## Please get back to me if you make any updates or changes that would be useful to others too. I will add the changes to the original script.
##
## VERSION
## 0.1
import mechanize
class GoogleIter(object):
    """Iterator over the result links of a Google search for one query.

    The search form on http://www.google.co.in is submitted through a
    ``mechanize.Browser`` the first time a result is requested; further
    result pages are fetched by following the page's "Next" link once the
    links of the current page have all been handed out.

    Result links are recognised purely by the text of the links that follow
    them on the page ("Cached", "Similar", "Translate this page").

    Attributes set on the instance:
        query          -- the search term given to the constructor
        link_list      -- result links found on the current page
        link_no        -- index of the next link to hand out (-1 before the
                          search has been submitted)
        link_limit     -- len(link_list)
        links_end      -- True once a page without a "Next" link was seen
        following_link -- the most recent "Next" link (set once one is found)
        br             -- the mechanize browser (set on first fetch)
    """

    def __init__(self,query):
        """Create the iterator by giving a query term.

        No network access happens here; the search is submitted lazily when
        the first result is requested.
        """
        self.query = query
        self.link_no = -1
        self.link_limit = 0
        self.link_list = []
        self.links_end = False

    @staticmethod
    def _extract_results(page_links):
        """Split a page's links into (result links, "Next" link or None).

        page_links is a sequence of objects with a ``text`` attribute, in
        page order.  A link is taken as a result when it directly precedes a
        "Cached" link (or precedes "Translate this page" + "Cached"), or
        directly precedes a "Similar" link that does not itself follow
        "Cached".  If several "Next" links exist the last one is returned.
        """
        results = []
        next_link = None
        for pos, link in enumerate(page_links):
            label = link.text
            if label == 'Next':
                next_link = link
                continue
            # A marker at position 0 has no preceding link to report.
            if pos == 0 or label not in ('Cached', 'Similar'):
                continue
            prev_label = page_links[pos - 1].text
            if label == 'Cached':
                # Step over an interposed "Translate this page" link.
                offset = 2 if prev_label == 'Translate this page' else 1
            elif prev_label != 'Cached':
                offset = 1
            else:
                # "Similar" right after "Cached": the "Cached" marker has
                # already accounted for this result.
                continue
            # Never index from the end of the list: a negative index would
            # silently pick an unrelated link.
            if offset <= pos:
                results.append(page_links[pos - offset])
        return results, next_link

    def _scan_current_page(self):
        """Return the result links of the page loaded in ``self.br``.

        Also updates the paging state: remembers the "Next" link in
        ``self.following_link``, or sets ``self.links_end`` when the page
        has none.
        """
        results, next_link = self._extract_results(list(self.br.links()))
        if next_link is None:
            self.links_end = True
        else:
            self.following_link = next_link
        return results

    def _load_page(self, links):
        """Make *links* the current page and rewind the position counter."""
        self.link_list = links
        self.link_limit = len(links)
        self.link_no = 0

    def find_next_link_ini(self):
        """Submit the query and return the result links of the first page.

        Not meant to be used externally; it is called internally when the
        first result is requested.
        """
        self.br = mechanize.Browser()
        # Browser options
        self.br.set_handle_equiv(True)
        try:
            # Experimental in mechanize: it emits a UserWarning, which is
            # only an exception when warnings are configured as errors.
            # Ignore just that, so the rest of the setup always runs.
            self.br.set_handle_gzip(True)
        except UserWarning:
            pass
        self.br.set_handle_redirect(True)
        self.br.set_handle_referer(True)
        self.br.set_handle_robots(False)
        # Want debugging messages?
        #self.br.set_debug_http(True)
        #self.br.set_debug_redirects(True)
        #self.br.set_debug_responses(True)
        # User-Agent: make the emulated browser pose as Firefox.
        self.br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
        self.br.open("http://www.google.co.in")
        # The search box is the first form on the page.
        self.br.select_form(nr=0)
        self.br.form['q'] = self.query
        self.br.submit()
        return self._scan_current_page()

    def find_next_link_list(self):
        """Follow the stored "Next" link and return that page's result links.

        Not meant to be used externally; requires that an earlier page
        provided ``self.following_link``.
        """
        self.br.follow_link(self.following_link)
        return self._scan_current_page()

    def find_next_link(self):
        """Return the next result link, fetching pages as required.

        Raises StopIteration when the last page has been exhausted.  Pages
        that contain no recognisable result links are skipped instead of
        causing an IndexError.
        """
        if self.link_no == -1:
            self._load_page(self.find_next_link_ini())
        while self.link_no >= self.link_limit:
            if self.links_end:
                raise StopIteration
            self._load_page(self.find_next_link_list())
        link = self.link_list[self.link_no]
        self.link_no += 1
        return link

    def __iter__(self):
        """Return the iterator itself (called by iter(xxx) and for-loops).

        Nothing is consumed here, so the first search result is delivered by
        the first call to next().
        """
        return self

    def next(self):
        """Return the next result link (Python 2 iterator protocol).

        Raises StopIteration when there are no more results.
        """
        return self.find_next_link()

    def __next__(self):
        """Python 3 spelling of next()."""
        return self.next()
if __name__=='__main__':
    # Executed as a standalone script: ask for a search term and build the
    # iterator for it.  No search is performed until the iterator is used.
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3: input() returns the raw string
    query = read_line("Enter the search term: ")
    google_iter = GoogleIter(query)
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("An iterator named google_iter has been created")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment