Last active
December 19, 2015 16:18
-
-
Save XayOn/5982492 to your computer and use it in GitHub Desktop.
Better scrappery script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import mechanize | |
from random import choice | |
class Scrapper(object):
    """
    Fetch a URL with a mechanize browser through a randomly chosen proxy.

    Either call scrap(url), or set self.url and then call
    do_anonymous_scrapping(); the response body is stored in self.data.

    Proxies are read from ``proxies.txt`` in the current working
    directory, one entry per line.  If ``safe`` is true (the default)
    and no proxy is available the request is refused.  If ``safe`` is
    false and no proxy is available the request is made unproxied.

    TODO: Random sleeps and random useragents
    """

    # File (relative to the current working directory) listing proxies.
    PROXY_FILE = 'proxies.txt'

    # Fixed User-agent header sent with every request.
    USER_AGENT = ('Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) '
                  'Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')

    def __init__(self, safe=True):
        """
        Initialise an idle scrapper.

        safe -- when true, refuse to perform a request if no proxy can
                be read from the proxy file.
        """
        self.browser = False  # mechanize.Browser, created per request
        self.data = False     # body of the last response
        self.url = False      # URL of the next/last request
        self.safe = safe

    def scrap(self, url):
        """
        Fetch ``url`` and return the response body.

        Raises Exception when safe mode is on and no proxy is available.
        """
        self.url = url
        self.do_anonymous_scrapping()
        return self.data

    def do_anonymous_scrapping(self):
        """
        Configure a mechanize browser (with a proxy when one is
        available), request self.url and store the body in self.data.

        Raises Exception when safe mode is on and no proxy is available.
        """
        # Resolve the proxy first so safe mode fails before any browser
        # or network setup happens.
        proxy = self.proxy
        if not proxy and self.safe:
            raise Exception("No proxies.txt and safe mode specified")

        browser = mechanize.Browser()
        browser.set_handle_robots(False)
        browser.set_handle_refresh(False)
        browser.addheaders = [('User-agent', self.USER_AGENT)]
        if proxy:
            # NOTE(review): only the "http" scheme is mapped to the proxy;
            # https:// URLs presumably bypass it -- confirm against the
            # mechanize documentation before relying on this for https.
            browser.set_proxies({"http": proxy})
        # else: unsafe mode without a proxy -> plain, unproxied request.

        self.browser = browser
        self.data = browser.open(self.url).read()

    @property
    def proxy(self):
        """
        Return a random proxy entry from the proxy file.

        Surrounding whitespace (including the line terminator) is
        stripped and blank lines are ignored.  Returns False when the
        file cannot be opened or contains no usable entry.
        """
        try:
            with open(self.PROXY_FILE) as proxy_file:
                entries = [line.strip() for line in proxy_file]
        except IOError:
            return False

        entries = [entry for entry in entries if entry]
        if not entries:
            # choice() raises IndexError on an empty sequence.
            return False
        return choice(entries)
if __name__ == "__main__":
    # Command-line entry point: fetch the URL given as the first argument
    # with safe mode disabled (falls back to an unproxied request when no
    # proxy is available).  The fetched body is not printed or saved.
    if len(sys.argv) < 2:
        sys.exit("usage: %s URL" % sys.argv[0])
    # scrap() accepts only the URL; passing a second positional argument
    # raised TypeError before any request was made.
    Scrapper(False).scrap(sys.argv[1])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment