Skip to content

Instantly share code, notes, and snippets.

@laughingman7743
Created September 22, 2018 15:24
Show Gist options
  • Save laughingman7743/2e4d83ca4e394dc645e9ea9a45fe78ba to your computer and use it in GitHub Desktop.
Save laughingman7743/2e4d83ca4e394dc645e9ea9a45fe78ba to your computer and use it in GitHub Desktop.
PyAthena performance

Amazon Linux 2

  • r5.large
    • vCPU: 2
    • Memory: 16
    • Networking Performance: Up to 10 Gigabit
  • Install package
    • sudo yum install python3 python3-pip git
    • sudo pip3 install pipenv
$ uname -a
Linux ip-xxx-xxx-xxx-xxx.region.compute.internal 4.14.62-70.117.amzn2.x86_64 #1 SMP Fri Aug 10 20:14:53 UTC 2018 x86_64 x86_64 x86_64 GNU/Linux

$ python3 -V
Python 3.7.0rc1

Table

Download statistics from the Python Package Index. https://bigquery.cloud.google.com/dataset/the-psf:pypi

  • Date: 2018-09-15
  • Format: Avro (Export from BigQuery to GCS in Avro format and upload it to S3)
  • Schema: pypi_downloads_20180915.sql
  • Size: 11.73 GB
  • Row count: 36,757,860

ResultSet

  • Query: SELECT * FROM pypi_downloads_20180915 WHERE file.project = 'pip'
  • Size: 1,027,322,970
  • Row count: 1,923,322
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import time
from pyathena import connect
from pyathena.pandas_cursor import PandasCursor
QUERY = """
SELECT * FROM pypi_downloads_20180915
WHERE file.project = 'pip'
"""
S3_STAGING_DIR = 's3://YOUR_BUCKET/path/to/'
REGION_NAME = 'us-west-2'
COUNT = 10
def main():
print('PyAthena PandasCursor =========================')
cursor = connect(s3_staging_dir=S3_STAGING_DIR,
region_name=REGION_NAME,
cursor_class=PandasCursor).cursor()
avg = []
for i in range(0, COUNT):
start = time.time()
df = cursor.execute(QUERY).as_pandas()
end = time.time()
elapsed = end - start
print('loop:{0}\tresult:{1}\telasped:{2}'.format(i, df.shape[0], elapsed))
avg.append(elapsed)
avg = sum(avg) / COUNT
print('Avg: {0}'.format(avg))
print('===============================================')
print()
print('PyAthena Cursor ===============================')
cursor = connect(s3_staging_dir=S3_STAGING_DIR,
region_name=REGION_NAME).cursor()
avg = []
for i in range(0, COUNT):
start = time.time()
result = cursor.execute(QUERY).fetchall()
end = time.time()
elapsed = end - start
print('loop:{0}\tresult:{1}\telasped:{2}'.format(i, len(result), elapsed))
avg.append(elapsed)
avg = sum(avg) / COUNT
print('Avg: {0}'.format(avg))
print('===============================================')
print()
if __name__ == '__main__':
main()
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"
[packages]
pandas = "*"
pyathena = {git = "https://github.com/laughingman7743/PyAthena.git", ref = "add_pandas_cursor"}
[dev-packages]
[requires]
python_version = "3.7"
CREATE EXTERNAL TABLE pypi_downloads_20180915 (
timestamp BIGINT,
country_code STRING,
url STRING,
file STRUCT<
filename: STRING,
project: STRING,
version: STRING,
type: STRING
>,
details STRUCT<
installer: STRUCT<
name: STRING,
version: STRING
>,
python: STRING,
implementation: STRUCT<
name: STRING,
version: STRING
>,
distro: STRUCT<
name: STRING,
version: STRING,
id: STRING,
libc: STRUCT<
lib: STRING,
version: STRING
>
>,
system: STRUCT<
name: STRING,
release: STRING
>,
cpu: STRING,
openssl_version: STRING,
setuptools_version: STRING
>,
tls_protocol STRING,
tls_cipher STRING
)
ROW FORMAT
SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
WITH SERDEPROPERTIES ('avro.schema.literal'='
{
"type": "record",
"name": "Root",
"fields": [{
"name": "timestamp",
"type": {
"type": "long",
"logicalType": "timestamp-micros"
}
}, {
"name": "country_code",
"type": ["null", "string"]
}, {
"name": "url",
"type": "string"
}, {
"name": "file",
"type": {
"type": "record",
"namespace": "root",
"name": "File",
"fields": [{
"name": "filename",
"type": "string"
}, {
"name": "project",
"type": "string"
}, {
"name": "version",
"type": "string"
}, {
"name": "type",
"type": "string"
}]
}
}, {
"name": "details",
"type": ["null", {
"type": "record",
"namespace": "root",
"name": "Details",
"fields": [{
"name": "installer",
"type": ["null", {
"type": "record",
"namespace": "root.details",
"name": "Installer",
"fields": [{
"name": "name",
"type": ["null", "string"]
}, {
"name": "version",
"type": ["null", "string"]
}]
}]
}, {
"name": "python",
"type": ["null", "string"]
}, {
"name": "implementation",
"type": ["null", {
"type": "record",
"namespace": "root.details",
"name": "Implementation",
"fields": [{
"name": "name",
"type": ["null", "string"]
}, {
"name": "version",
"type": ["null", "string"]
}]
}]
}, {
"name": "distro",
"type": ["null", {
"type": "record",
"namespace": "root.details",
"name": "Distro",
"fields": [{
"name": "name",
"type": ["null", "string"]
}, {
"name": "version",
"type": ["null", "string"]
}, {
"name": "id",
"type": ["null", "string"]
}, {
"name": "libc",
"type": ["null", {
"type": "record",
"namespace": "root.details.distro",
"name": "Libc",
"fields": [{
"name": "lib",
"type": ["null", "string"]
}, {
"name": "version",
"type": ["null", "string"]
}]
}]
}]
}]
}, {
"name": "system",
"type": ["null", {
"type": "record",
"namespace": "root.details",
"name": "System",
"fields": [{
"name": "name",
"type": ["null", "string"]
}, {
"name": "release",
"type": ["null", "string"]
}]
}]
}, {
"name": "cpu",
"type": ["null", "string"]
}, {
"name": "openssl_version",
"type": ["null", "string"]
}, {
"name": "setuptools_version",
"type": ["null", "string"]
}]
}]
}, {
"name": "tls_protocol",
"type": ["null", "string"]
}, {
"name": "tls_cipher",
"type": ["null", "string"]
}]
}
')
STORED AS AVRO
LOCATION 's3://YOUR_BUCKET/path/to/';
PyAthena PandasCursor =========================
loop:0 result:1923322 elasped:48.798383474349976
loop:1 result:1923322 elasped:46.71501660346985
loop:2 result:1923322 elasped:45.6755907535553
loop:3 result:1923322 elasped:45.595343828201294
loop:4 result:1923322 elasped:45.6150267124176
loop:5 result:1923322 elasped:45.6240599155426
loop:6 result:1923322 elasped:46.58779001235962
loop:7 result:1923322 elasped:47.690996170043945
loop:8 result:1923322 elasped:47.47061204910278
loop:9 result:1923322 elasped:45.56822085380554
Avg: 46.53410403728485
===============================================
PyAthena Cursor ===============================
loop:0 result:1923322 elasped:615.0867228507996
loop:1 result:1923322 elasped:683.0776832103729
loop:2 result:1923322 elasped:681.3894004821777
loop:3 result:1923322 elasped:607.708498954773
loop:4 result:1923322 elasped:616.3735427856445
loop:5 result:1923322 elasped:618.806414604187
loop:6 result:1923322 elasped:573.4212465286255
loop:7 result:1923322 elasped:636.8158755302429
loop:8 result:1923322 elasped:605.3360025882721
loop:9 result:1923322 elasped:649.157306432724
Avg: 628.717269396782
===============================================
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment