@XayOn
Last active December 19, 2015 16:18
Better scraping script
import sys
from random import choice

import mechanize


class Scrapper(object):
    """
    Main scrapper object.

    Use self.do_anonymous_scrapping after setting self.url
    (or use the scrap() method).

    If safe is set to False and proxies.txt is not present,
    the request is made without a proxy.

    TODO: Random sleeps and random user agents.
    """

    def __init__(self, safe=True):
        """Set properties."""
        self.browser = False
        self.data = False
        self.url = False
        self.safe = safe

    def scrap(self, url):
        """Scrape the given URL and return the response body."""
        self.url = url
        self.do_anonymous_scrapping()
        return self.data

    def do_anonymous_scrapping(self):
        """
        Configure a mechanize browser with a random proxy,
        perform the request and store the response in self.data.
        """
        self.browser = mechanize.Browser()
        self.browser.set_handle_robots(False)
        self.browser.set_handle_refresh(False)
        self.browser.addheaders = [(
            'User-agent',
            'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) '
            'Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]

        proxy = self.proxy
        if proxy:
            self.browser.set_proxies({"http": proxy})
        elif self.safe:
            raise Exception("No proxies.txt and safe mode specified")

        self.data = self.browser.open(self.url).read()

    @property
    def proxy(self):
        """Return a random proxy from proxies.txt, or False if missing."""
        try:
            with open('proxies.txt') as proxy_file:
                # splitlines() avoids returning a proxy with a trailing newline
                return choice(proxy_file.read().splitlines())
        except IOError:
            return False


if __name__ == "__main__":
    Scrapper(safe=False).scrap(sys.argv[1])
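
A minimal usage sketch, assuming the script above is saved as scrapper.py (the module name is hypothetical) and that a proxies.txt file with one proxy per line (e.g. host:port entries) sits in the working directory:

from scrapper import Scrapper

# Safe mode (default): raises if proxies.txt is missing.
html = Scrapper().scrap('http://example.com')

# Unsafe mode: falls back to a direct, unproxied request
# when no proxies.txt is found.
html = Scrapper(safe=False).scrap('http://example.com')
print(html)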