Skip to content

Instantly share code, notes, and snippets.

@altaurog
Last active January 29, 2016 11:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save altaurog/ed3a9583c17ac737c758 to your computer and use it in GitHub Desktop.
Save altaurog/ed3a9583c17ac737c758 to your computer and use it in GitHub Desktop.
"""
Performance benchmark: pandas DataFrame.pivot against crosstab
"""
from datetime import datetime, timedelta
import io
import random
import sys
import time
import pandas as pd
import psycopg2
CLIENT_COUNT = 80000
DATE_COUNT = 120
def main(dbstr):
connection = psycopg2.connect(dbstr)
dates = generate_dates(datetime(2015, 1, 1), DATE_COUNT)
setup_database(connection, dates)
print('crosstab: %.1f' % with_crosstab(connection, dates))
connection.rollback()
setup_database(connection, dates)
print('pandas: %0.1f / %0.1f' % with_pandas(connection))
def with_crosstab(connection, dates):
values = ','.join("('%s'::date)" % d for d in dates)
columns = ','.join('"day%s" int' % d for d in dates)
query = """
SELECT * from crosstab (
'SELECT client, datestamp, count(*)
FROM mytable
GROUP BY client, datestamp
ORDER BY client, datestamp',
$$VALUES %s$$) AS ct ("Client" int, %s)
""" % (values, columns)
cur = connection.cursor()
start = time.time()
cur.execute(query)
data = cur.fetchall()
return time.time() - start
def with_pandas(connection):
query = """
SELECT client, datestamp, count(*)
FROM mytable
GROUP BY client, datestamp
ORDER BY client, datestamp
"""
a = time.time()
cur = connection.cursor()
cur.execute(query)
data = cur.fetchall()
b = time.time()
df = pd.DataFrame(data, columns=['client', 'datestamp', 'count'])
data = df.pivot(index='client', columns='datestamp', values='count')
return b - a, time.time() - b
def setup_database(connection, dates):
cur = connection.cursor()
cur.execute("""
CREATE TEMP TABLE mytable (
client int,
datestamp date,
amount int
) ON COMMIT DROP
""")
buf = io.StringIO('\n'.join(generate_data(CLIENT_COUNT, dates)))
cur.copy_from(buf, 'mytable')
cur.execute("CREATE INDEX ON mytable (client, datestamp)")
cur.close()
def generate_data(num_clients, dates):
for i in range(num_clients):
for d in dates:
for x in range(random.randint(0,10)):
yield '%d\t%s\t1' % (i, d)
def generate_dates(startdate, count):
return [startdate + timedelta(d) for d in range(count)]
if __name__ == '__main__':
main(sys.argv[1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment