Created
June 1, 2015 15:31
-
-
Save ayushgoel/075502787a85ca7e4e65 to your computer and use it in GitHub Desktop.
This script was moved here after Google Code shut down. It provides an iterator over Google search results.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
## DESCRIPTION | |
## The script provides an iterator over the search results returned by google, after giving a query. | |
## | |
## EXAMPLES | |
## TODO: Show some examples of how to use this script. | |
## | |
## EXIT STATUS | |
## No exit status is provided. The class simply returns an iterator over the results; the only exception raised is StopIteration.
## | |
## AUTHOR | |
## Ayush Goel <ayushgoel111@gmail.com> | |
## | |
## LICENSE | |
## GPL v3 used | |
## uploaded on code http://code.google.com/p/google-search-iter/ | |
## This script is in the public domain, free from copyrights or restrictions. | |
## Please let me know if you make any updates or changes that would be useful to others; I will add them to the original script.
## | |
## VERSION | |
## 0.1 | |
import mechanize | |
class GoogleIter(object):
    """Iterator over the result links of a Google search.

    Create it with a query string and iterate over it; every item is a link
    object taken from the browser's ``links()`` listing.  Pages are fetched
    lazily: the first page when the first link is requested, and each
    following page only once every result of the current page has been
    handed out.

    ``__next__`` is aliased to ``next`` so the iterator protocol is satisfied
    on Python 3 as well as Python 2.
    """

    # Page whose first form is used as the search box.
    SEARCH_URL = "http://www.google.co.in"
    # The emulated browser poses as Firefox.
    USER_AGENT = ('Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) '
                  'Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')

    def __init__(self, query):
        """Create the iterator for the search term *query*.

        No network access happens here; the search is submitted when the
        first link is requested.
        """
        self.query = query
        self.link_no = -1        # -1 means "the first page was not fetched yet"
        self.link_limit = 0      # number of results on the current page
        self.link_list = []      # results of the current page
        self.links_end = False   # True once a scanned page had no 'Next' link

    def _collect_result_links(self):
        """Scan the page currently loaded in ``self.br`` for result links.

        A result is recognised by the link that follows it:

        * a 'Cached' link marks the link before it as a result, or the link
          two positions before it when a 'Translate this page' link sits in
          between;
        * a 'Similar' link that does not directly follow 'Cached' marks the
          link before it as a result.

        A 'Next' link is remembered in ``self.following_link``; when the page
        has none, ``self.links_end`` is set so iteration stops after this
        page.

        Returns the list of result links in page order.
        """
        page_links = list(self.br.links())
        results = []
        found_next = False
        for i, link in enumerate(page_links):
            # Only look backwards when a previous link really exists;
            # a negative index would silently wrap to the end of the list.
            prev_text = page_links[i - 1].text if i >= 1 else None
            if link.text == 'Cached':
                if prev_text == 'Translate this page':
                    if i >= 2:
                        results.append(page_links[i - 2])
                elif i >= 1:
                    results.append(page_links[i - 1])
            elif link.text == 'Similar' and prev_text != 'Cached':
                if i >= 1:
                    results.append(page_links[i - 1])
            elif link.text == 'Next':
                found_next = True
                self.following_link = link
        if not found_next:
            self.links_end = True
        return results

    def find_next_link_ini(self):
        """Internal: set up the browser, submit the query, scan page one.

        Not meant to be called from outside; ``find_next_link`` calls it the
        first time a link is requested.

        Returns the list of result links found on the first result page.
        """
        self.br = mechanize.Browser()
        # Browser options.
        self.br.set_handle_equiv(True)
        try:
            # Experimental in mechanize and announced with a warning.  Only
            # this call is guarded, so that a warning escalated to an
            # exception cannot skip the rest of the setup and the page load.
            self.br.set_handle_gzip(True)
        except UserWarning:
            pass
        self.br.set_handle_redirect(True)
        self.br.set_handle_referer(True)
        self.br.set_handle_robots(False)
        # For HTTP debugging, enable set_debug_http / set_debug_redirects /
        # set_debug_responses on self.br here.
        self.br.addheaders = [('User-agent', self.USER_AGENT)]
        self.br.open(self.SEARCH_URL)

        # The first form on the page is used as the search form.
        self.br.select_form(nr=0)
        self.br.form['q'] = self.query
        self.br.submit()
        return self._collect_result_links()

    def find_next_link_list(self):
        """Internal: follow the stored 'Next' link and scan the new page.

        Not meant to be called from outside.

        Returns the list of result links found on that page.
        """
        self.br.follow_link(self.following_link)
        return self._collect_result_links()

    def find_next_link(self):
        """Return the next result link, fetching pages as needed.

        Raises:
            StopIteration: when the current page is used up and the last
                scanned page had no 'Next' link.
        """
        if self.link_no == -1:
            self.link_list = self.find_next_link_ini()
            self.link_limit = len(self.link_list)
            self.link_no = 0
        # A loop rather than a single check: a fetched page may contain no
        # recognisable results at all.
        while self.link_no >= self.link_limit:
            if self.links_end:
                raise StopIteration
            self.link_list = self.find_next_link_list()
            self.link_limit = len(self.link_list)
            self.link_no = 0
        link = self.link_list[self.link_no]
        self.link_no += 1
        return link

    def __iter__(self):
        """Return the iterator itself; no result is consumed by ``iter()``."""
        return self

    def next(self):
        """Return the next result link or raise ``StopIteration``."""
        return self.find_next_link()

    # Python 3 spelling of the iterator protocol.
    __next__ = next
if __name__ == '__main__':
    # Run as a standalone script: ask for a search term and build the
    # iterator.  The result is left in the module-level name google_iter
    # (nothing is fetched or printed here besides the confirmation).
    try:
        _read_line = raw_input  # Python 2
    except NameError:
        _read_line = input  # Python 3: input() returns the raw string
    query = _read_line("Enter the search term: ")
    google_iter = GoogleIter(query)
    print("An iterator named google_iter has been created")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment