Skip to content

Instantly share code, notes, and snippets.

@tshrinivasan
Created September 9, 2012 03:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tshrinivasan/3682456 to your computer and use it in GitHub Desktop.
Save tshrinivasan/3682456 to your computer and use it in GitHub Desktop.
import mechanize
import cookielib
# Browser configuration adapted from:
# http://stockrt.github.com/p/emulating-a-browser-in-python-with-mechanize/

# The single Browser instance used for every request in this script.
br = mechanize.Browser()

# Give the browser a cookie jar to store cookies in.
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)

# Browser options: these four handlers are switched on, in this order.
for _enable_handler in (
    br.set_handle_equiv,
    br.set_handle_gzip,
    br.set_handle_redirect,
    br.set_handle_referer,
):
    _enable_handler(True)
# robots.txt handling is the only option explicitly switched off.
br.set_handle_robots(False)

# Refresh handling is capped with max_time=1; per the original author's
# note this follows "refresh 0" without hanging on refreshes > 0.
_refresh_processor = mechanize._http.HTTPRefreshProcessor()
br.set_handle_refresh(_refresh_processor, max_time=1)

# Uncomment any of these to get debugging output:
#br.set_debug_http(True)
#br.set_debug_redirects(True)
#br.set_debug_responses(True)

# Send a desktop Firefox User-Agent header instead of the library default
# (the original author calls this "cheating").
_user_agent = 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'
br.addheaders = [('User-agent', _user_agent)]
# Download the numbered "tamillex" pages from dsal.uchicago.edu, saving each
# page as file<page>.html in the current directory.
#
# Per the original author's note, pages exist up to number 20107:
# http://dsal.uchicago.edu/cgi-bin/philologic/getobject.pl?c.0:1:20107.tamillex
#
# Resuming an interrupted run: an earlier run stopped at page 3358 and was
# restarted from 3359.  To do the same, set START_PAGE to the first page
# that is still missing; the output file number always equals the page
# number, so there is no separate counter to keep in sync.
PAGE_URL = "http://dsal.uchicago.edu/cgi-bin/philologic/getobject.pl?c.0:1:%d.tamillex"
START_PAGE = 0
END_PAGE = 20001  # exclusive: the last page requested is END_PAGE - 1

# Page numbers whose request failed, reported once the loop finishes.
failed_pages = []

for page in range(START_PAGE, END_PAGE):
    url = PAGE_URL % page
    print(url)
    try:
        r = br.open(url)
        content = r.read()
    except mechanize.URLError as exc:
        # One bad page must not abort the whole crawl: note it and move on.
        # (HTTPError is a subclass of URLError, so HTTP error statuses are
        # handled here as well.)
        print("FAILED %s: %s" % (url, exc))
        failed_pages.append(page)
        continue

    filename = "file" + str(page) + ".html"
    # Record the final URL (after any redirect) as an HTML comment on the
    # first line of the saved file.
    content = "<!-- " + br.geturl() + "-->" + "\n" + content
    # Binary mode writes the fetched bytes unchanged; text mode would
    # rewrite newlines on platforms such as Windows.
    with open(filename, 'wb') as fo:
        fo.write(content)

if failed_pages:
    print("Pages that could not be downloaded: %s" % failed_pages)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment