Created
February 8, 2011 02:52
-
-
Save e000/815758 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Proxy endpoints that requests are sent through. Every entry is unpacked as
# a (host, port) pair when a page is fetched, so fill this in before use.
proxies = []
from collections import deque | |
class rotate(deque):
    """A deque that hands out its items in round-robin order.

    Each call to get() returns the item at the right end and moves it to the
    left end, so repeated calls cycle through every item in turn.
    """

    def get(self):
        """Return the next item in the rotation.

        The rightmost item is moved to the left end and returned.

        Raises:
            IndexError: if the deque holds no items.
        """
        if not self:
            # Same exception type as popping an empty deque, but with a
            # message that names the actual problem for the caller.
            raise IndexError('get() called on an empty rotate; add items first')
        # deque.rotate(1) moves the rightmost item to the left end in one
        # step, which is exactly a pop() followed by appendleft().
        self.rotate(1)
        return self[0]
# Rebind the plain list as a rotating deque so that each fetch takes the
# next proxy in turn.
proxies = rotate(proxies)
from twisted.web.client import HTTPClientFactory, HTTPPageGetter | |
from twisted.internet import reactor | |
from twisted.python.util import InsensitiveDict | |
from twisted.internet import defer, protocol, reactor | |
from random import choice | |
import base64 | |
class HTTPPageGetter(HTTPPageGetter):
    """Page getter whose request is written for delivery through a proxy.

    The request line carries ``factory.url`` and, when the factory holds
    credentials, a Basic ``Proxy-authorization`` header is sent.
    """

    def connectionMade(self):
        """Write the request line, the headers and any body to the transport."""
        factory = self.factory

        self.sendCommand(getattr(factory, 'method', 'GET'), factory.url)

        if factory.creds:
            # creds must be a two-item tuple (presumably user and password);
            # it is joined with ':' and base64-encoded for Basic auth.
            token = base64.b64encode('%s:%s' % factory.creds).strip()
            self.sendHeader('Proxy-authorization', 'Basic ' + token)

        self.sendHeader('Host', factory.headers.get("host", factory.host))
        self.sendHeader('User-Agent', factory.agent)

        body = getattr(factory, 'postdata', None)
        has_body = body is not None
        if has_body:
            self.sendHeader("Content-Length", str(len(body)))

        cookie_parts = []
        for name, value in factory.headers.items():
            lowered = name.lower()
            # Headers named in _specialHeaders are emitted by this method
            # itself, so they are not forwarded verbatim.
            if lowered not in self._specialHeaders:
                self.sendHeader(name, value)
            if lowered == 'cookie':
                cookie_parts.append(value)
        cookie_parts.extend(
            '%s=%s' % (cookie, cookval)
            for cookie, cookval in factory.cookies.items())
        if cookie_parts:
            # A Cookie header from the caller and the factory's cookie dict
            # are merged into one header line.
            self.sendHeader('Cookie', '; '.join(cookie_parts))

        self.endHeaders()
        self.headers = {}

        if has_body:
            self.transport.write(body)
class HTTPClientFactory(HTTPClientFactory):
    """Client factory that also carries proxy credentials.

    Instances store ``creds`` for the protocol to read and build their
    connections with this module's HTTPPageGetter.
    """

    protocol = HTTPPageGetter

    def __init__(self, url, creds=None, method='GET', postdata=None, headers=None,
                 agent="Twisted PageGetter", timeout=0, cookies=None,
                 followRedirect=True, redirectLimit=20,
                 afterFoundGet=False):
        """Record the request parameters and create the result deferred.

        ``creds`` is stored as given; the remaining arguments are stored on
        the instance under their own names. ``headers`` is copied into an
        InsensitiveDict and ``cookies`` defaults to a new empty dict.
        """
        # Redirect handling.
        self.followRedirect = followRedirect
        self.redirectLimit = redirectLimit
        self._redirectCount = 0
        self.afterFoundGet = afterFoundGet

        # Request description.
        self.timeout = timeout
        self.agent = agent
        self.cookies = {} if cookies is None else cookies
        self.headers = (
            InsensitiveDict() if headers is None else InsensitiveDict(headers))
        if postdata is not None:
            self.headers.setdefault('Content-Length', len(postdata))
            # just in case a broken http/1.1 decides to keep connection alive
            self.headers.setdefault("connection", "close")
        self.postdata = postdata
        self.method = method

        self.setURL(url)
        self.creds = creds

        # Response state.
        self.waiting = 1
        self.deferred = defer.Deferred()
        self.response_headers = None
# Credentials getPage() falls back to when the caller passes none.
_creds = None


def getPage(url, creds=None, contextFactory=None, *args, **kwargs):
    """
    Download a web page through the next proxy in the rotation.

    Return a deferred, which will callback with a page (as a string) or
    errback with a description of the error.

    @param url: URL of the page to download.
    @param creds: Proxy credentials; when falsy, the module-level C{_creds}
        is used instead.
    @param contextFactory: Forwarded unchanged to C{_makeGetterFactory}.

    Any further positional or keyword arguments are passed on to
    HTTPClientFactory; see it for what can be given.

    @raise IndexError: if C{proxies} is empty.
    """
    host, port = proxies.get()
    # contextFactory is passed positionally on purpose: given as a keyword
    # ahead of *args, the first extra positional argument would bind to the
    # same parameter and the call would fail with "multiple values".
    return _makeGetterFactory(
        host,
        port,
        creds or _creds,
        url,
        HTTPClientFactory,
        contextFactory,
        *args, **kwargs).deferred
def _makeGetterFactory(phost, pport, creds, url, factoryFactory, contextFactory=None,
                       *args, **kwargs):
    """
    Build a page getting factory and connect it over TCP.

    Any additional positional or keyword arguments are used when calling
    C{factoryFactory}.

    @param phost: Host to connect to (callers in this module pass a proxy).
    @param pport: Port to connect to on C{phost}.
    @param creds: Given to C{factoryFactory} as its second argument.
    @param url: Given to C{factoryFactory} as its first argument.
    @param factoryFactory: Callable invoked as
        C{factoryFactory(url, creds, *args, **kwargs)} to produce the getter.
    @param contextFactory: Accepted but not used here; the connection is
        always made with C{reactor.connectTCP}.

    @return: The factory created by C{factoryFactory}
    """
    getter = factoryFactory(url, creds, *args, **kwargs)
    reactor.connectTCP(phost, pport, getter)
    return getter
if __name__ == '__main__':
    # Demo: fetch one page through the configured proxies, print the result
    # (or the reason it failed), then stop the reactor so the script exits.
    def show_page(page):
        # Call form of print works on both Python 2 and Python 3.
        print(page)

    def show_error(failure):
        # Without an errback a failed fetch would print nothing at all.
        print(failure.getErrorMessage())

    def shutdown(_):
        reactor.stop()

    fetch = getPage('http://www.google.com')
    fetch.addCallbacks(show_page, show_error)
    # addBoth runs after either outcome, so the reactor never keeps running
    # once the single request has finished.
    fetch.addBoth(shutdown)
    reactor.run()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment