Skip to content

Instantly share code, notes, and snippets.

@e000
Created February 8, 2011 02:52
Show Gist options
  • Save e000/815758 to your computer and use it in GitHub Desktop.
Save e000/815758 to your computer and use it in GitHub Desktop.
# Seed list of upstream proxies. It is wrapped in a `rotate` further down, and
# getPage() unpacks every entry as `host, port`, so each entry must be a
# two-item sequence. Empty as shipped.
proxies = []
from collections import deque
class rotate(deque):
    """
    A deque that hands its items out in an endless round-robin cycle.
    """

    def get(self):
        """
        Return the next item in the cycle.

        The right-most item is removed, re-inserted at the left end and
        returned, so repeated calls walk through every item before any item
        is handed out a second time.

        @return: the item that was at the right end of the deque
        @raise IndexError: if the deque holds no items
        """
        if not self:
            # Same exception type a bare pop() would raise, but with a message
            # that points at the actual problem (nothing to rotate through).
            raise IndexError('rotate.get() called on an empty rotation')
        item = self.pop()
        self.appendleft(item)
        return item
# Rebind the name to a rotating deque so that successive getPage() calls,
# which call proxies.get(), cycle through the entries.
proxies = rotate(proxies)
from twisted.web.client import HTTPClientFactory, HTTPPageGetter
from twisted.internet import reactor
from twisted.python.util import InsensitiveDict
from twisted.internet import defer, protocol, reactor
from random import choice
import base64
class HTTPPageGetter(HTTPPageGetter):
    """
    Page getter that writes its request in the form an HTTP proxy expects.

    It sends the factory's full URL on the request line and, when the factory
    carries credentials, a C{Proxy-authorization: Basic ...} header.
    """

    def connectionMade(self):
        """
        Send the request line, the headers and any request body.

        Everything is read from C{self.factory}: C{method}, C{url}, C{creds},
        C{host}, C{headers}, C{agent}, C{postdata} and C{cookies}.
        """
        factory = self.factory
        method = getattr(factory, 'method', 'GET')
        # The request line carries the whole URL rather than only the path.
        self.sendCommand(method, factory.url)

        if factory.creds:
            # tuple() lets creds be any two-item sequence; a list would
            # otherwise be treated by % as one single argument.
            credentials = '%s:%s' % tuple(factory.creds)
            self.sendHeader('Proxy-authorization',
                            'Basic ' + base64.b64encode(credentials))

        # Include the port in the Host header unless it is the scheme's
        # default. scheme/port are expected to be set by the factory's
        # inherited setURL(); getattr keeps a factory without them working,
        # in which case only the bare host is sent.
        host = factory.host
        port = getattr(factory, 'port', None)
        scheme = getattr(factory, 'scheme', 'http')
        defaultPort = {'http': 80, 'https': 443}.get(scheme)
        if port is not None and port != defaultPort:
            host = '%s:%s' % (host, port)
        self.sendHeader('Host', factory.headers.get("host", host))
        self.sendHeader('User-Agent', factory.agent)

        data = getattr(factory, 'postdata', None)
        if data is not None:
            self.sendHeader("Content-Length", str(len(data)))

        cookieData = []
        for key, value in factory.headers.items():
            lowered = key.lower()
            if lowered not in self._specialHeaders:
                # Headers named in _specialHeaders are computed by this
                # method itself, so caller-supplied copies are not forwarded.
                self.sendHeader(key, value)
            if lowered == 'cookie':
                cookieData.append(value)
        for cookie, cookval in factory.cookies.items():
            cookieData.append('%s=%s' % (cookie, cookval))
        if cookieData:
            # A caller-supplied Cookie header and the cookies dict are merged
            # into one header.
            self.sendHeader('Cookie', '; '.join(cookieData))

        self.endHeaders()
        # Start from an empty mapping; presumably the base class fills it
        # while parsing the response -- confirm against Twisted.
        self.headers = {}

        if data is not None:
            self.transport.write(data)
class HTTPClientFactory(HTTPClientFactory):
    """
    Client factory that additionally carries proxy credentials.

    Connections it builds use the proxy-aware L{HTTPPageGetter}.
    """

    protocol = HTTPPageGetter

    def __init__(self, url, creds=None, method='GET', postdata=None, headers=None,
                 agent="Twisted PageGetter", timeout=0, cookies=None,
                 followRedirect=True, redirectLimit=20,
                 afterFoundGet=False):
        """
        Store the request parameters and create the result deferred.

        @param url: handed to C{setURL}
        @param creds: stored as C{self.creds}; the protocol formats it with
            C{'%s:%s'}, so presumably a (username, password) pair
        @param method: stored as C{self.method}
        @param postdata: request body, or C{None} for no body
        @param headers: initial request headers, or C{None} for none
        @param agent: stored as C{self.agent}
        @param timeout: stored as C{self.timeout}
        @param cookies: dict stored as C{self.cookies} (a new empty dict
            when C{None})
        @param followRedirect: stored as C{self.followRedirect}
        @param redirectLimit: stored as C{self.redirectLimit}
        @param afterFoundGet: stored as C{self.afterFoundGet}
        """
        # Redirect handling state.
        self.followRedirect = followRedirect
        self.redirectLimit = redirectLimit
        self._redirectCount = 0
        self.afterFoundGet = afterFoundGet

        self.timeout = timeout
        self.agent = agent
        self.cookies = {} if cookies is None else cookies

        if headers is None:
            requestHeaders = InsensitiveDict()
        else:
            requestHeaders = InsensitiveDict(headers)
        self.headers = requestHeaders
        if postdata is not None:
            requestHeaders.setdefault('Content-Length', len(postdata))
            # Ask for the connection to be closed in case a misbehaving
            # HTTP/1.1 peer would otherwise keep it open.
            requestHeaders.setdefault("connection", "close")

        self.postdata = postdata
        self.method = method
        self.setURL(url)
        self.creds = creds

        # Result plumbing.
        self.waiting = 1
        self.deferred = defer.Deferred()
        self.response_headers = None
# Module-wide fallback credentials: getPage() uses this value whenever its own
# `creds` argument is falsy. HTTPPageGetter formats creds with '%s:%s', so this
# is presumably a (username, password) tuple -- confirm with callers.
_creds = None
def getPage(url, creds=None, contextFactory=None, *args, **kwargs):
    """
    Download a web page through the next proxy in the rotation.

    Return a deferred, which will callback with a page (as a string) or
    errback with a description of the error.

    See HTTPClientFactory to see what extra args can be passed.

    @param url: the URL to fetch
    @param creds: proxy credentials; when falsy the module-level C{_creds}
        is used instead
    @param contextFactory: forwarded unchanged to C{_makeGetterFactory}
    @return: the C{deferred} attribute of the factory that was created
    """
    host, port = proxies.get()
    # contextFactory is passed positionally on purpose: passing it by keyword
    # next to *args lets the first extra positional argument occupy the same
    # parameter and raises "got multiple values for keyword argument".
    return _makeGetterFactory(
        host,
        port,
        creds or _creds,
        url,
        HTTPClientFactory,
        contextFactory,
        *args, **kwargs).deferred
def _makeGetterFactory(phost, pport, creds, url, factoryFactory, contextFactory=None,
                       *args, **kwargs):
    """
    Build an HTTP page getting factory and connect it to a proxy.

    Any additional positional or keyword arguments are used when calling
    C{factoryFactory}.

    @param phost: host the TCP connection is opened to
    @param pport: port the TCP connection is opened to
    @param creds: second positional argument given to C{factoryFactory}
    @param url: first positional argument given to C{factoryFactory}
    @param factoryFactory: Factory factory that is called with C{url},
        C{creds}, C{args} and C{kwargs} to produce the getter
    @param contextFactory: accepted for signature compatibility; this
        function does not use it
    @return: The factory created by C{factoryFactory}
    """
    getter = factoryFactory(url, creds, *args, **kwargs)
    # The connection goes to phost:pport; the URL itself is only handed to
    # the factory.
    reactor.connectTCP(phost, pport, getter)
    return getter
if __name__ == '__main__':
    # Demo: fetch one page through the proxy rotation, print the outcome and
    # then shut the reactor down so the script exits.
    def showPage(page):
        # print(...) with a single argument behaves the same on Python 2 and 3.
        print(page)

    def showFailure(failure):
        # Report the error instead of leaving it unhandled in the deferred.
        print(failure)

    d = getPage('http://www.google.com')
    d.addCallbacks(showPage, showFailure)
    d.addBoth(lambda ignored: reactor.stop())
    reactor.run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment