Skip to content

Instantly share code, notes, and snippets.

@140am
Created June 11, 2014 21:59
Show Gist options
  • Save 140am/c78754cb116868b91dfd to your computer and use it in GitHub Desktop.
""" Python HTTP client implementations benchmark
# Bandwidth Throughput
INFO:__main__:TEST: pycurl
INFO:__main__:pycurl: Returned status in 17 ms
INFO:__main__:Completed read of 1527248057 bytes in 15410 ms at 756 Mbps
INFO:__main__:TEST: urllib2
INFO:__main__:urllib2: Returned status in 18 ms
INFO:__main__:Completed read of 1527248057 bytes in 14554 ms at 800 Mbps
INFO:__main__:TEST: urllib3
INFO:__main__:urllib3: Returned status in 3 ms
INFO:__main__:Completed read of 1527248057 bytes in 15352 ms at 758 Mbps
INFO:__main__:TEST: requests
INFO:__main__:requests: Returned status in 35 ms
INFO:__main__:Completed read of 1527248057 bytes in 14534 ms at 801 Mbps
# CPU / Memory Usage
-- curl
32219 cdn 20 0 169396 7664 3584 S 25.6 0.0 0:02.52 python
-- urllib2
32281 cdn 20 0 189632 9428 3920 S 29.5 0.0 0:06.09 python
-- urllib3
33218 cdn 20 0 202200 11476 3852 S 27.6 0.0 0:02.44 python
-- urllib3 + gevent
33286 cdn 20 0 285612 12764 4260 R 80.7 0.0 0:06.58 python
-- requests
32219 cdn 20 0 213052 13892 4268 S 37.4 0.0 0:16.52 python
"""
import time
import os
import pycurl
import cStringIO
import asyncore
import socket
import logging
logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)
class HTTPClient(asyncore.dispatcher):
    """Minimal asynchronous HTTP/1.1 GET client for the asyncore benchmark.

    Connects to ``host`` on port 80, sends a GET for ``path``, and counts
    received bytes in ``self.cc``; the payload itself is discarded (only
    throughput is measured).
    """

    def __init__(self, host, path):
        asyncore.dispatcher.__init__(self)
        self.create_socket(socket.AF_INET, socket.SOCK_STREAM)
        self.connect((host, 80))
        # BUG FIX: HTTP requires CRLF ("\r\n") line terminators for every
        # header line (RFC 7230 sec. 3). The original used bare "\n" for all
        # but the final line, which strict servers reject.
        self.buffer = (
            'GET %s HTTP/1.1\r\n'
            'Accept: */*\r\n'
            'Accept-Encoding: gzip, deflate, compress\r\n'
            'Host: %s\r\n'
            'User-Agent: Beluga CDNode/0.9.0\r\n\r\n'
        ) % (path, host)
        self.cc = 0  # cumulative count of response bytes received

    def handle_connect(self):
        pass

    def handle_close(self):
        self.close()

    def handle_read(self):
        # Read up to 64 KiB per readiness event; only the length is kept.
        buf = self.recv(1024 * 64)
        self.cc += len(buf)
        return buf

    def writable(self):
        # Remain write-interested until the entire request has been sent.
        return (len(self.buffer) > 0)

    def handle_write(self):
        sent = self.send(self.buffer)
        self.buffer = self.buffer[sent:]
def print_stat(time_s, file_size):
    """Log transfer size, elapsed time and throughput.

    :param time_s: transfer start time in milliseconds (``time.time() * 1000``)
    :param file_size: number of bytes transferred
    """
    runtime = (time.time() * 1000) - time_s
    # BUG FIX: guard against ZeroDivisionError for instantaneous transfers
    # (cached response, coarse clock, or clock skew).
    if runtime <= 0:
        runtime = 1
    # bytes -> bits per second, then scaled by 1024*1024 ("Mbps" here is
    # mebibit-based, matching the figures quoted in the module docstring).
    download_rate = (file_size / (runtime / 1000)) * 8
    download_rate = download_rate / 1024 / 1024
    log.info('Completed read of %i bytes in %i ms at %i Mbps' % (
        file_size, runtime, download_rate
    ))
if __name__ == "__main__":
    # Full candidate list; the next assignment narrows the run to a single
    # client (debug leftover -- edit this line to select which tests execute).
    test = ['pycurl', 'requests', 'urllib2', 'urllib3']
    test = ['urllib3']
    file_input = 'http://pcdn.adam.gs/test.mp4'
    file_output = 'test.mp4'
    CHUNK_SIZE = 1024 * 64  # 64 KiB read size used by every streaming test

    # ---- geventhttpclient: CON concurrent greenlets share one client ----
    if test and 'geventhttpclient' in test:
        log.info('TEST: geventhttpclient')

        def fetch_page(http, url):
            # One greenlet: issue a GET and drain the body, counting bytes.
            # Closes over CHUNK_SIZE and gevent from the enclosing scope.
            log.info('gevent greenlet')
            file_size = 0
            time_s = time.time() * 1000
            response = http.get(url.request_uri)
            assert response.status_code == 200
            log.info('geventhttpclient: Returned status in %i ms' % (
                (time.time() * 1000) - time_s
            ))
            chunk = response.read(CHUNK_SIZE)
            while chunk:
                file_size += len(chunk)
                chunk = response.read(CHUNK_SIZE)
                gevent.sleep(0)  # yield so the other greenlets can run
            print_stat(time_s, file_size)

        import gevent
        import gevent.pool
        from geventhttpclient.url import URL
        url = URL(file_input)
        from geventhttpclient import HTTPClient as GeventHTTPClient
        http = GeventHTTPClient.from_url(url, concurrency=10)
        CON = 2  # number of concurrent download greenlets
        pool = gevent.pool.Pool(CON)
        for i in range(CON):
            log.info('connecting to: %s' % url)
            pool.spawn(fetch_page, http, url)
        """
        with open(file_output, 'w') as fp:
            data = response.read(CHUNK_SIZE)
            while data:
                fp.write(data)
                data = response.read(CHUNK_SIZE)
        """
        pool.join()
        http.close()
        time.sleep(1)

    # ---- pycurl: libcurl with header/body callbacks ---------------------
    if test and 'pycurl' in test:
        log.info('TEST: pycurl')
        time_s = time.time() * 1000
        file_size = 0
        temp_buffer_head = cStringIO.StringIO()
        temp_buffer_body = cStringIO.StringIO()
        printed_header = None  # flag: time-to-first-header logged yet?

        def temp_write_header(header):
            # HEADERFUNCTION callback: log time-to-first-header once, then
            # accumulate the raw header lines in temp_buffer_head.
            global printed_header
            if not printed_header:
                printed_header = True
                log.info('pycurl: Returned status in %i ms' % (
                    (time.time() * 1000) - time_s
                ))
            # look for first CRLF after header response
            if header.find('\r\n\r\n') != -1:
                log.warn('HEADER DONE')
            temp_buffer_head.write(header)
            return len(header)

        def temp_write_func(chunk):
            # WRITEFUNCTION callback: discard the body, report bytes consumed
            # (returning len(chunk) tells libcurl the data was handled).
            #log.info('READ CHUNK: %i' % len(chunk))
            return len(chunk)

        def print_progress(download_t, download_d, upload_t, upload_d):
            # Optional PROGRESSFUNCTION callback (disabled below).
            log.info(
                "Total to download %d bytes, have %d bytes so far" % (
                    download_t, download_d
                ))

        curl = pycurl.Curl()
        curl.setopt(pycurl.URL, file_input)
        curl.setopt(pycurl.CONNECTTIMEOUT, 10)
        curl.setopt(pycurl.TIMEOUT, 300)
        curl.setopt(pycurl.HTTPHEADER, ["Accept:"])  # suppress default Accept
        #curl.setopt(pycurl.WRITEHEADER, temp_buffer_head)
        #curl.setopt(pycurl.WRITEDATA, temp_buffer_body)
        curl.setopt(pycurl.HEADERFUNCTION, temp_write_header)
        curl.setopt(pycurl.WRITEFUNCTION, temp_write_func)
        curl.setopt(curl.NOPROGRESS, 1)
        #curl.setopt(curl.PROGRESSFUNCTION, print_progress)
        curl.setopt(pycurl.FOLLOWLOCATION, 1)
        curl.setopt(pycurl.MAXREDIRS, 5)
        #curl.setopt(curl.NOBODY, True)
        curl.setopt(curl.USERAGENT, "Mozilla/5.0 (compatible; pycurl)")
        curl.perform()  # blocks until the transfer completes
        file_size = curl.getinfo(curl.CONTENT_LENGTH_DOWNLOAD)
        log.info("HTTP-code: %s" % curl.getinfo(curl.HTTP_CODE))
        log.info("Total-time: %s" % curl.getinfo(curl.TOTAL_TIME))
        log.info("Document size: %d bytes" % curl.getinfo(curl.SIZE_DOWNLOAD))
        log.info("Effective URL: %s" % curl.getinfo(curl.EFFECTIVE_URL))
        log.info("Content-type: %s" % curl.getinfo(curl.CONTENT_TYPE))
        log.info("Namelookup-time: %s" % curl.getinfo(curl.NAMELOOKUP_TIME))
        log.info("Redirect-time: %s" % curl.getinfo(curl.REDIRECT_TIME))
        log.info("Redirect-count: %s" % curl.getinfo(curl.REDIRECT_COUNT))
        epoch = curl.getinfo(curl.INFO_FILETIME)
        log.info("Filetime: %d (%s)" % (epoch, time.ctime(epoch)))
        temp_buffer_head.flush()
        log.info('HEADER: %s' % temp_buffer_head.getvalue())
        curl.close()
        temp_buffer_head.close()
        temp_buffer_body.close()
        print_stat(time_s, file_size)
        time.sleep(1)

    # ---- urllib2 over gevent monkey-patched sockets ----------------------
    if test and 'urllib2' in test:
        log.info('TEST: urllib2')
        time_s = time.time() * 1000
        file_size = 0
        import gevent
        from gevent import monkey
        # patch_all() must run before urllib2 is imported so its sockets are
        # gevent-cooperative.
        monkey.patch_all()
        #import geventhttpclient.httplib
        #geventhttpclient.httplib.patch()
        import urllib2
        req = urllib2.urlopen(file_input)
        log.info('urllib2: Returned status in %i ms' % (
            (time.time() * 1000) - time_s
        ))
        while True:
            chunk = req.read(CHUNK_SIZE)
            if not chunk: break
            file_size += len(chunk)
            #output.write(chunk)
            gevent.sleep(0)
        print_stat(time_s, file_size)
        time.sleep(1)

    # ---- urllib3 with a PoolManager --------------------------------------
    if test and 'urllib3' in test:
        log.info('TEST: urllib3')
        file_size = 0
        from gevent import monkey
        monkey.patch_all()
        import urllib3
        # LRU of 10 connections
        # (num_pools is the number of per-host pools PoolManager keeps)
        http = urllib3.PoolManager(
            num_pools=10
        )
        # HEAD request first -- presumably to warm up the connection so the
        # GET timing below excludes setup cost; TODO confirm intent.
        req = http.request(
            method='HEAD',
            url=file_input,
            preload_content=False
        )
        time_s = time.time() * 1000
        req = http.request(
            method='GET',
            url=file_input,
            preload_content=False
        )
        log.info('urllib3: Returned status in %i ms' % (
            (time.time() * 1000) - time_s
        ))
        for chunk in req.stream():
            if not chunk: break
            file_size += len(chunk)
            #output.write(chunk)
        req.release_conn()  # return the connection to the pool
        print_stat(time_s, file_size)
        time.sleep(1)

    # ---- asyncore: uses the HTTPClient class defined above ---------------
    if test and 'asyncore' in test:
        log.info('TEST: asyncore')
        time_s = time.time() * 1000
        file_size = 0
        import urlparse
        url_obj = urlparse.urlparse(file_input)
        client = HTTPClient(url_obj.netloc, url_obj.path)
        asyncore.loop()  # blocks until the connection closes
        # NOTE(review): loop() runs the entire transfer, so unlike the other
        # tests this "Returned status" time includes the full download.
        log.info('asyncore: Returned status in %i ms' % (
            (time.time() * 1000) - time_s
        ))
        print_stat(time_s, client.cc)
        time.sleep(1)

    # ---- requests: streaming iter_content --------------------------------
    if test and 'requests' in test:
        log.info('TEST: requests')
        import requests
        time_s = time.time() * 1000
        file_size = 0
        r = requests.get(file_input, stream=True)
        log.info('requests: Returned status in %i ms' % (
            (time.time() * 1000) - time_s
        ))
        for chunk in r.iter_content(chunk_size=CHUNK_SIZE):
            if chunk:
                file_size += len(chunk)
        print_stat(time_s, file_size)
        time.sleep(1)

    ##
    ## write tests

    # ---- urllib.urlretrieve: download straight to disk -------------------
    if test and 'urlretrieve' in test:
        log.info('TEST: urlretrieve')
        time_s = time.time() * 1000
        import urllib
        urllib.urlretrieve(file_input, file_output)
        file_size = os.stat(file_output).st_size
        print_stat(time_s, file_size)
        time.sleep(1)

    # ---- urllib2 with chunked writes to disk -----------------------------
    if test and 'urllib2-disk' in test:
        log.info('TEST: urllib2-disk')
        time_s = time.time() * 1000
        import urllib2
        req = urllib2.urlopen(file_input)
        with open(file_output, 'wb') as fp:
            while True:
                chunk = req.read(CHUNK_SIZE)
                if not chunk: break
                fp.write(chunk)
        file_size = os.stat(file_output).st_size
        print_stat(time_s, file_size)
        time.sleep(1)

    # ---- shutil.copyfileobj: stdlib buffered copy to disk ----------------
    if test and 'shutil' in test:
        log.info('TEST: shutil')
        time_s = time.time() * 1000
        import shutil
        import urllib2
        req = urllib2.urlopen(file_input)
        with open(file_output, 'wb') as fp:
            shutil.copyfileobj(req, fp)
        file_size = os.stat(file_output).st_size
        print_stat(time_s, file_size)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment