peterbe/s3downloads.py

## s3downloads.py
import os
import io
from gzip import GzipFile
from urllib.parse import urlparse

import requests
import boto3


# Both benchmark files live in the same public bucket and are addressed with
# path-style URLs (https://<endpoint>/<bucket>/<key>).

# ~18.4MB compressed
URL_BIG = "/".join([
    "https://s3-us-west-2.amazonaws.com",
    "org.mozilla.crash-stats.symbols-public",
    "v1/xul.pdb/C017F3ED83534FCE9CAA2057D8BCEE322/xul.sym",
])

# ~1.2MB compressed
URL_SMALL = "/".join([
    "https://s3-us-west-2.amazonaws.com",
    "org.mozilla.crash-stats.symbols-public",
    "v1/wntdll.pdb/D74F79EB1F8D4A45ABCD2F476CCABACC2/wntdll.sym",
])

# Shared boto3 S3 resource, created once at import time and bound to the
# us-west-2 region.
s3 = boto3.resource('s3', region_name='us-west-2')


def f1(url):
    """Download *url* with a single ``requests.get`` and return the body size.

    The whole body is read through ``Response.content`` and its length in
    bytes is returned.
    """
    response = requests.get(url)
    body = response.content
    return len(body)


def f2(url, chunk_size=512):
    """Stream *url* into an in-memory buffer and return the number of bytes.

    Args:
        url: Address to download.
        chunk_size: Size passed to ``Response.iter_content`` for each
            iteration.  Defaults to 512.

    Returns:
        The length in bytes of everything written to the buffer.
    """
    buffer = io.BytesIO()
    # With stream=True the connection is only released once the body has been
    # consumed or the response is closed; the with-block guarantees the close
    # even when reading fails part-way through.
    with requests.get(url, stream=True) as r:
        for chunk in r.iter_content(chunk_size=chunk_size):
            if chunk:  # ignore empty chunks
                buffer.write(chunk)
    return len(buffer.getvalue())


def f3(url, chunk_size=1024):  # same as f2 but bigger chunk size
    """Stream *url* into an in-memory buffer and return the number of bytes.

    Args:
        url: Address to download.
        chunk_size: Size passed to ``Response.iter_content`` for each
            iteration.  Defaults to 1024.

    Returns:
        The length in bytes of everything written to the buffer.
    """
    buffer = io.BytesIO()
    # With stream=True the connection is only released once the body has been
    # consumed or the response is closed; the with-block guarantees the close
    # even when reading fails part-way through.
    with requests.get(url, stream=True) as r:
        for chunk in r.iter_content(chunk_size=chunk_size):
            if chunk:  # ignore empty chunks
                buffer.write(chunk)
    return len(buffer.getvalue())


def f4(url):
    """Fetch the object behind *url* through the boto3 ``s3`` resource.

    The URL path is split as ``/<bucket>/<key>``.  The object body is read
    fully and gunzipped; if ``GzipFile`` raises ``OSError`` the raw bytes are
    measured instead.  Returns the resulting length in bytes.
    """
    path = urlparse(url).path
    _, bucket_name, key = path.split('/', 2)
    response = s3.Object(bucket_name=bucket_name, key=key).get()
    buffer = io.BytesIO(response["Body"].read())
    try:
        return len(GzipFile(None, 'rb', fileobj=buffer).read())
    except OSError:
        # Presumably not gzip data: rewind and count the payload as-is.
        buffer.seek(0)
        return len(buffer.read())


def _stats(r):
    """Return the median, average and standard deviation of *r*.

    Args:
        r: Iterable of numbers.  It is copied into a list first, so one-shot
            iterators are accepted as well as sequences.

    Returns:
        A ``(median, average, standard deviation)`` tuple.  For an even number
        of values the median is the mean of the two middle values.  The
        standard deviation divides by ``n - 1``, or by 1 when there is only a
        single value (which yields 0.0).

    Raises:
        ZeroDivisionError: If *r* is empty.
    """
    data = list(r)
    n = len(data)
    avg = sum(data) / n
    sdsq = sum((x - avg) ** 2 for x in data)
    ordered = sorted(data)
    mid = n // 2
    if n % 2:
        median = ordered[mid]
    else:
        # Even count: there is no single middle element, so average the pair
        # straddling the centre instead of picking the upper one.
        median = (ordered[mid - 1] + ordered[mid]) / 2
    return median, avg, (sdsq / (n - 1 or 1)) ** .5


if __name__ == '__main__':
    # Benchmark driver: times every download function against both URLs over
    # three rounds and prints, per (function, file), the median and standard
    # deviation of the elapsed seconds.

    # Optional sanity checks of the returned sizes (left disabled):
    # assert f1(URL_BIG) == 87794590
    # assert f1(URL_SMALL) == 1244266
    # assert f2(URL_BIG) == 87794590
    # assert f2(URL_SMALL) == 1244266
    # assert f3(URL_BIG) == 87794590
    # assert f3(URL_SMALL) == 1244266
    # assert f4(URL_BIG) == 87794590
    # assert f4(URL_SMALL) == 1244266

    import random
    import time

    functions = [f1, f2, f3, f4]
    # (function name, file name) -> list of (elapsed seconds, returned size)
    results = {}
    for _ in range(3):
        # Fresh order every round; presumably so that no function is always
        # the first one to hit a given URL.
        random.shuffle(functions)
        for url in URL_BIG, URL_SMALL:
            for f in functions:
                key = (f.__name__, os.path.basename(url))
                # perf_counter() is monotonic and high resolution, so the
                # interval cannot be skewed by system clock adjustments.
                t0 = time.perf_counter()
                res = f(url)
                t1 = time.perf_counter()
                # print(key, res, t1 - t0)
                results.setdefault(key, []).append((t1 - t0, res))

    for key in sorted(results):
        times = [elapsed for elapsed, _size in results[key]]
        med, _avg, std = _stats(times)
        print(key[0], '\t', key[1], '\t', round(med, 3), '\t', round(std, 3))
	import os
	import io
	from gzip import GzipFile
	from urllib.parse import urlparse

	import requests
	import boto3


	# Both benchmark files live in the same public bucket and are addressed with
	# path-style URLs (https://<endpoint>/<bucket>/<key>).

	# ~18.4MB compressed
	URL_BIG = "/".join([
	    "https://s3-us-west-2.amazonaws.com",
	    "org.mozilla.crash-stats.symbols-public",
	    "v1/xul.pdb/C017F3ED83534FCE9CAA2057D8BCEE322/xul.sym",
	])

	# ~1.2MB compressed
	URL_SMALL = "/".join([
	    "https://s3-us-west-2.amazonaws.com",
	    "org.mozilla.crash-stats.symbols-public",
	    "v1/wntdll.pdb/D74F79EB1F8D4A45ABCD2F476CCABACC2/wntdll.sym",
	])

	# Shared boto3 S3 resource, created once and bound to the us-west-2 region.
	s3 = boto3.resource('s3', region_name='us-west-2')


	def f1(url):
	    """Download *url* with a single ``requests.get`` and return the body size.

	    The whole body is read through ``Response.content`` and its length in
	    bytes is returned.
	    """
	    r = requests.get(url)
	    return len(r.content)


	def f2(url, chunk_size=512):
	    """Stream *url* into an in-memory buffer and return the number of bytes.

	    Args:
	        url: Address to download.
	        chunk_size: Size passed to ``Response.iter_content`` for each
	            iteration.  Defaults to 512.

	    Returns:
	        The length in bytes of everything written to the buffer.
	    """
	    buffer = io.BytesIO()
	    # The with-block closes the streamed response even when reading fails
	    # part-way through.
	    with requests.get(url, stream=True) as r:
	        for chunk in r.iter_content(chunk_size=chunk_size):
	            if chunk:  # ignore empty chunks
	                buffer.write(chunk)
	    return len(buffer.getvalue())


	def f3(url, chunk_size=1024):  # same as f2 but bigger chunk size
	    """Stream *url* into an in-memory buffer and return the number of bytes.

	    Args:
	        url: Address to download.
	        chunk_size: Size passed to ``Response.iter_content`` for each
	            iteration.  Defaults to 1024.

	    Returns:
	        The length in bytes of everything written to the buffer.
	    """
	    buffer = io.BytesIO()
	    # The with-block closes the streamed response even when reading fails
	    # part-way through.
	    with requests.get(url, stream=True) as r:
	        for chunk in r.iter_content(chunk_size=chunk_size):
	            if chunk:  # ignore empty chunks
	                buffer.write(chunk)
	    return len(buffer.getvalue())


	def f4(url):
	    """Fetch the object behind *url* through the boto3 ``s3`` resource.

	    The URL path is split as ``/<bucket>/<key>``.  The object body is read
	    fully and gunzipped; if ``GzipFile`` raises ``OSError`` the raw bytes
	    are measured instead.  Returns the resulting length in bytes.
	    """
	    _, bucket_name, key = urlparse(url).path.split('/', 2)
	    obj = s3.Object(
	        bucket_name=bucket_name,
	        key=key
	    )
	    buffer = io.BytesIO(obj.get()["Body"].read())
	    try:
	        got_text = GzipFile(None, 'rb', fileobj=buffer).read()
	    except OSError:
	        # Presumably not gzip data: rewind and use the payload as-is.
	        buffer.seek(0)
	        got_text = buffer.read()
	    return len(got_text)


	def _stats(r):
	    """Return the median, average and standard deviation of *r*.

	    Args:
	        r: Iterable of numbers.  It is copied into a list first, so
	            one-shot iterators are accepted as well as sequences.

	    Returns:
	        A ``(median, average, standard deviation)`` tuple.  For an even
	        number of values the median is the mean of the two middle values.
	        The standard deviation divides by ``n - 1``, or by 1 when there is
	        only a single value (which yields 0.0).

	    Raises:
	        ZeroDivisionError: If *r* is empty.
	    """
	    data = list(r)
	    n = len(data)
	    avg = sum(data) / n
	    sdsq = sum((x - avg) ** 2 for x in data)
	    ordered = sorted(data)
	    mid = n // 2
	    if n % 2:
	        median = ordered[mid]
	    else:
	        # Even count: average the pair straddling the centre.
	        median = (ordered[mid - 1] + ordered[mid]) / 2
	    return median, avg, (sdsq / (n - 1 or 1)) ** .5


	if __name__ == '__main__':
	    # Benchmark driver: times every download function against both URLs
	    # over three rounds and prints, per (function, file), the median and
	    # standard deviation of the elapsed seconds.

	    # Optional sanity checks of the returned sizes (left disabled):
	    # assert f1(URL_BIG) == 87794590
	    # assert f1(URL_SMALL) == 1244266
	    # assert f2(URL_BIG) == 87794590
	    # assert f2(URL_SMALL) == 1244266
	    # assert f3(URL_BIG) == 87794590
	    # assert f3(URL_SMALL) == 1244266
	    # assert f4(URL_BIG) == 87794590
	    # assert f4(URL_SMALL) == 1244266

	    import random
	    import time

	    functions = [f1, f2, f3, f4]
	    # (function name, file name) -> list of (elapsed seconds, returned size)
	    results = {}
	    for _ in range(3):
	        # Fresh order every round; presumably so that no function is always
	        # the first one to hit a given URL.
	        random.shuffle(functions)
	        for url in URL_BIG, URL_SMALL:
	            for f in functions:
	                key = (f.__name__, os.path.basename(url))
	                # perf_counter() is monotonic and high resolution, so the
	                # interval cannot be skewed by system clock adjustments.
	                t0 = time.perf_counter()
	                res = f(url)
	                t1 = time.perf_counter()
	                # print(key, res, t1 - t0)
	                results.setdefault(key, []).append((t1 - t0, res))

	    for key in sorted(results):
	        times = [elapsed for elapsed, _size in results[key]]
	        med, _avg, std = _stats(times)
	        print(key[0], '\t', key[1], '\t', round(med, 3), '\t', round(std, 3))