Skip to content

Instantly share code, notes, and snippets.

@shashi
Last active November 27, 2017 13:03
Show Gist options
  • Save shashi/a95a21f61a5fcf8812f8d0705b4e527f to your computer and use it in GitHub Desktop.
Save shashi/a95a21f61a5fcf8812f8d0705b4e527f to your computer and use it in GitHub Desktop.
JuliaDB benchmarks vs Pandas
using IndexedTables
using PooledArrays
using BenchmarkTools
#key() = randstring(10)
key() = rand()
key1 = [key() for i=1:8000]
key2 = [key() for i=1:8000]
cs = columns(convert(Columns, rand(collect(zip(key1, key2)), 80000)))
#cs = map(PooledArray, cs)
# 80k keys with 8k uniques
t = table(cs..., rand(1:10^6, 80000))
key1tail = [key() for i=1:2000]
key2tail = [key() for i=1:2000]
# 8k keys, 6k are from larger table
cs = (vcat(key1[1:6000], key1tail), vcat(key2[1:6000], key2tail))
smallt = table(cs..., rand(8000))
y=@btime innerjoin(t, smallt, lkey=(1,2), rkey=(1,2), cache=false)
z=@btime outerjoin(t, smallt, lkey=(1,2), rkey=(1,2), cache=false)
x=@btime leftjoin(t, smallt, lkey=(1,2), rkey=(1,2), cache=false)
# original script by Wes McKinney
import random
import gc
import time
from pandas import *
from pandas.compat import range, lrange, StringIO
#from pandas.util.testing import rands
from random import random
N = 10000
ngroups = 10
def get_test_data(ngroups=100, n=N):
unique_groups = lrange(ngroups)
arr = np.asarray(np.tile(unique_groups, n / ngroups), dtype=object)
if len(arr) < n:
arr = np.asarray(list(arr) + unique_groups[:n - len(arr)],
dtype=object)
random.shuffle(arr)
return arr
# aggregate multiple columns
# df = DataFrame({'key1' : get_test_data(ngroups=ngroups),
# 'key2' : get_test_data(ngroups=ngroups),
# 'data1' : np.random.randn(N),
# 'data2' : np.random.randn(N)})
# df2 = DataFrame({'key1' : get_test_data(ngroups=ngroups, n=N//10),
# 'key2' : get_test_data(ngroups=ngroups//2, n=N//10),
# 'value' : np.random.randn(N // 10)})
# result = merge.merge(df, df2, on='key2')
N = 10000
indices = np.array([random() for _ in range(N)], dtype='O')
indices2 = np.array([random() for _ in range(N)], dtype='O')
key = np.tile(indices[:8000], 10)
key2 = np.tile(indices2[:8000], 10)
left = DataFrame({'key': key, 'key2': key2,
'value': np.random.randn(80000)})
right = DataFrame({'key': indices[2000:], 'key2': indices2[2000:],
'value2': np.random.randn(8000)})
right2 = right.append(right, ignore_index=True)
join_methods = ['inner', 'outer', 'left', 'right']
results = DataFrame(index=join_methods, columns=[False, True])
niter = 10
print left
print right
for sort in [False, True]:
for join_method in join_methods:
elapsed = 1.0
print len(left), len(right)
f = lambda: merge(left, right, how=join_method, sort=sort)
gc.disable()
start = time.time()
for _ in range(niter):
f()
elapsed = min(elapsed, (time.time() - start)/niter)
gc.enable()
results[sort][join_method] = elapsed
# results.columns = ['pandas']
results.columns = ['dont_sort', 'sort']
# R results
# many to one
r_results = read_table(StringIO(""" base::merge plyr data.table
inner 0.2475 0.1183 0.1100
outer 0.4213 0.1916 0.2090
left 0.2998 0.1188 0.0572
right 0.3102 0.0536 0.0376
"""), sep='\s+')
presults = results[['dont_sort']].rename(columns={'dont_sort': 'pandas'})
all_results = presults.join(r_results)
print all_results
all_results = all_results.div(all_results['pandas'], axis=0)
all_results = all_results.ix[:, ['pandas', 'data.table', 'plyr',
'base::merge']]
sort_results = DataFrame.from_items([('pandas', results['sort']),
('R', r_results['base::merge'])])
sort_results['Ratio'] = sort_results['R'] / sort_results['pandas']
nosort_results = DataFrame.from_items([('pandas', results['dont_sort']),
('R', r_results['base::merge'])])
nosort_results['Ratio'] = nosort_results['R'] / nosort_results['pandas']
# many to many
# many to one
r_results = read_table(StringIO("""base::merge plyr data.table
inner 0.4610 0.1276 0.1269
outer 0.9195 0.1881 0.2725
left 0.6559 0.1257 0.0678
right 0.6425 0.0522 0.0428
"""), sep='\s+')
all_results = presults.join(r_results)
all_results = all_results.div(all_results['pandas'], axis=0)
all_results = all_results.ix[:, ['pandas', 'data.table', 'plyr',
'base::merge']]
print all_results
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"using JuliaDB\n",
"using PyCall\n",
"\n",
"@pyimport pandas as pd\n",
"@pyimport numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"DataStructures.OrderedDict{Any,Any} with 0 entries"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"using DataStructures\n",
"benchmarks = OrderedDict()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\"data/sample/yellow_tripdata_2016-01.csv.small.csv\""
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"file = \"data/sample/yellow_tripdata_2016-01.csv.small.csv\""
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Metadata for 1 / 1 files can be loaded from cache.\n"
]
}
],
"source": [
"trips = loadtable([file],);"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"pdtrips = pd.read_csv(file,\n",
" parse_dates=[\"tpep_pickup_datetime\", \"tpep_dropoff_datetime\"]\n",
");"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"DataStructures.OrderedDict{Any,Any} with 0 entries"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"using BenchmarkTools, DataStructures\n",
"\n",
"benchmarks = OrderedDict()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"using OnlineStats"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"BenchmarkTools.Trial: \n",
" memory estimate: 20.82 MiB\n",
" allocs estimate: 283\n",
" --------------\n",
" minimum time: 11.797 ms (0.00% GC)\n",
" median time: 13.190 ms (9.51% GC)\n",
" mean time: 12.974 ms (7.30% GC)\n",
" maximum time: 16.593 ms (8.61% GC)\n",
" --------------\n",
" samples: 386\n",
" evals/sample: 1"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"benchmarks[\"mean(VendorID) (JuliaDB)\"] = @benchmark groupby(mean, trips, :VendorID, select=:fare_amount)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"BenchmarkTools.Trial: \n",
" memory estimate: 1.30 KiB\n",
" allocs estimate: 39\n",
" --------------\n",
" minimum time: 16.487 ms (0.00% GC)\n",
" median time: 17.013 ms (0.00% GC)\n",
" mean time: 17.521 ms (0.00% GC)\n",
" maximum time: 31.922 ms (0.00% GC)\n",
" --------------\n",
" samples: 286\n",
" evals/sample: 1"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"benchmarks[\"mean(VendorID) (pandas)\"] = @benchmark pdtrips[:groupby](\"VendorID\")[\"fare_amount\"][:mean]()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"BenchmarkTools.Trial: \n",
" memory estimate: 44.08 MiB\n",
" allocs estimate: 617\n",
" --------------\n",
" minimum time: 47.192 ms (5.60% GC)\n",
" median time: 47.526 ms (5.60% GC)\n",
" mean time: 48.001 ms (5.54% GC)\n",
" maximum time: 58.676 ms (4.76% GC)\n",
" --------------\n",
" samples: 105\n",
" evals/sample: 1"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"benchmarks[\"count(dayofweek, passenger_count) (JuliaDB)\"] =\n",
" @benchmark groupby(\n",
" length, trips, (:tpep_pickup_datetime => Dates.dayofweek, :passenger_count),\n",
" select=:fare_amount\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"BenchmarkTools.Trial: \n",
" memory estimate: 2.33 KiB\n",
" allocs estimate: 64\n",
" --------------\n",
" minimum time: 83.629 ms (0.00% GC)\n",
" median time: 84.582 ms (0.00% GC)\n",
" mean time: 88.259 ms (0.00% GC)\n",
" maximum time: 118.477 ms (0.00% GC)\n",
" --------------\n",
" samples: 57\n",
" evals/sample: 1"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"benchmarks[\"count(dayofweek, passenger_count) (pandas)\"] =\n",
" @benchmark pdtrips[:groupby](\n",
" [pdtrips[\"tpep_pickup_datetime\"][:dt][:dayofweek],\n",
" \"passenger_count\"]\n",
" )[\"fare_amount\"][:count]()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"BenchmarkTools.Trial: \n",
" memory estimate: 29.31 MiB\n",
" allocs estimate: 426\n",
" --------------\n",
" minimum time: 51.797 ms (3.20% GC)\n",
" median time: 52.686 ms (3.04% GC)\n",
" mean time: 55.009 ms (2.94% GC)\n",
" maximum time: 113.571 ms (3.39% GC)\n",
" --------------\n",
" samples: 91\n",
" evals/sample: 1"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"benchmarks[\"count(UDF, passenger_count) (JuliaDB)\"] =\n",
" @benchmark groupby(\n",
" length, trips, (:tpep_pickup_datetime => x->Dates.dayofweek(x) in (1,3,5), :passenger_count),\n",
" select=:fare_amount\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"BenchmarkTools.Trial: \n",
" memory estimate: 3.14 KiB\n",
" allocs estimate: 92\n",
" --------------\n",
" minimum time: 280.981 ms (0.00% GC)\n",
" median time: 298.877 ms (0.00% GC)\n",
" mean time: 306.623 ms (0.00% GC)\n",
" maximum time: 378.541 ms (0.00% GC)\n",
" --------------\n",
" samples: 17\n",
" evals/sample: 1"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"benchmarks[\"count(UDF, passenger_count) (pandas)\"] =\n",
" @benchmark pdtrips[:groupby](\n",
" [pdtrips[\"tpep_pickup_datetime\"][:dt][:dayofweek][:map](py\"lambda x: x in (0,2,4)\"),\n",
" \"passenger_count\"]\n",
" )[\"fare_amount\"][:count]()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"BenchmarkTools.Trial: \n",
" memory estimate: 68.96 MiB\n",
" allocs estimate: 6398\n",
" --------------\n",
" minimum time: 190.741 ms (1.50% GC)\n",
" median time: 439.807 ms (1.21% GC)\n",
" mean time: 411.283 ms (1.47% GC)\n",
" maximum time: 661.178 ms (1.30% GC)\n",
" --------------\n",
" samples: 13\n",
" evals/sample: 1"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"benchmarks[\"count(dayofweek, passenger_count, floor) (JuliaDB)\"] =\n",
" @benchmark groupby(\n",
" length, trips, (:tpep_pickup_datetime => Dates.dayofweek,\n",
" :passenger_count,\n",
" :trip_distance => floor,\n",
" ),\n",
" select=:fare_amount\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"BenchmarkTools.Trial: \n",
" memory estimate: 3.14 KiB\n",
" allocs estimate: 91\n",
" --------------\n",
" minimum time: 115.429 ms (0.00% GC)\n",
" median time: 134.804 ms (0.00% GC)\n",
" mean time: 151.322 ms (0.00% GC)\n",
" maximum time: 330.770 ms (0.00% GC)\n",
" --------------\n",
" samples: 35\n",
" evals/sample: 1"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"benchmarks[\"count(dayofweek, passenger_count, floor) (pandas)\"] =\n",
" @benchmark pdtrips[:groupby]([\n",
" pdtrips[\"tpep_pickup_datetime\"][:dt][:dayofweek],\n",
" \"passenger_count\",\n",
" pdtrips[\"trip_distance\"][:apply](np.floor),\n",
" ])[\"fare_amount\"][:count]()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"BenchmarkTools.Trial: \n",
" memory estimate: 73.99 MiB\n",
" allocs estimate: 6392\n",
" --------------\n",
" minimum time: 95.103 ms (3.09% GC)\n",
" median time: 105.681 ms (4.90% GC)\n",
" mean time: 156.297 ms (12.24% GC)\n",
" maximum time: 492.291 ms (54.16% GC)\n",
" --------------\n",
" samples: 32\n",
" evals/sample: 1"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"benchmarks[\"count(dayofweek, passenger_count, floor(Int)) (JuliaDB)\"] =\n",
" @benchmark groupby(\n",
" length, trips, (:tpep_pickup_datetime => Dates.dayofweek,\n",
" :passenger_count,\n",
" :trip_distance => x -> floor(Int, x),\n",
" ),\n",
" select=:fare_amount\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"pdindexed1 = pdtrips[:set_index]([\"passenger_count\", pdtrips[\"tpep_pickup_datetime\"][:dt][:dayofweek]]);"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Table with 1363357 rows, 19 columns:\n",
"Columns:\n",
"\u001b[1m# \u001b[22m\u001b[1mcolname \u001b[22m\u001b[1mtype\u001b[22m\n",
"───────────────────────────────────\n",
"1 passenger_count Int64\n",
"2 dayofweek Int64\n",
"3 VendorID Int64\n",
"4 tpep_dropoff_datetime DateTime\n",
"5 trip_distance Float64\n",
"6 pickup_longitude Float64\n",
"7 pickup_latitude Float64\n",
"8 RatecodeID Int64\n",
"9 store_and_fwd_flag String\n",
"10 dropoff_longitude Float64\n",
"11 dropoff_latitude Float64\n",
"12 payment_type Int64\n",
"13 fare_amount Float64\n",
"14 extra Float64\n",
"15 mta_tax Float64\n",
"16 tip_amount Float64\n",
"17 tolls_amount Float64\n",
"18 improvement_surcharge Float64\n",
"19 total_amount Float64"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"indexed1 = reindex(trips, (:passenger_count, :dayofweek => :tpep_pickup_datetime => Dates.dayofweek))"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"BenchmarkTools.Trial: \n",
" memory estimate: 65.11 KiB\n",
" allocs estimate: 1250\n",
" --------------\n",
" minimum time: 5.207 ms (0.00% GC)\n",
" median time: 6.591 ms (0.00% GC)\n",
" mean time: 8.776 ms (0.06% GC)\n",
" maximum time: 24.250 ms (0.00% GC)\n",
" --------------\n",
" samples: 569\n",
" evals/sample: 1"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"benchmarks[\"count(passenger_count, dayofweek) (indexed JuliaDB)\"] =\n",
" @benchmark groupby(length, indexed1)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"BenchmarkTools.Trial: \n",
" memory estimate: 2.80 KiB\n",
" allocs estimate: 58\n",
" --------------\n",
" minimum time: 51.726 ms (0.00% GC)\n",
" median time: 58.241 ms (0.00% GC)\n",
" mean time: 69.100 ms (0.00% GC)\n",
" maximum time: 193.826 ms (0.00% GC)\n",
" --------------\n",
" samples: 73\n",
" evals/sample: 1"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"benchmarks[\"count(passenger_count, dayofweek) (indexed pandas)\"] =\n",
" @benchmark pdindexed1[:groupby](level=[\"passenger_count\", \"tpep_pickup_datetime\"])[\"fare_amount\"][:count]()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"DataStructures.OrderedDict{Any,Any} with 11 entries:\n",
" \"mean(VendorID) (JuliaDB)\" => Trial(11.797 ms)\n",
" \"mean(VendorID) (pandas)\" => Trial(16.487 ms)\n",
" \"count(dayofweek, passenger_count) (JuliaDB)\" => Trial(47.192 ms)\n",
" \"count(dayofweek, passenger_count) (pandas)\" => Trial(83.629 ms)\n",
" \"count(UDF, passenger_count) (JuliaDB)\" => Trial(51.797 ms)\n",
" \"count(UDF, passenger_count) (pandas)\" => Trial(280.981 ms)\n",
" \"count(dayofweek, passenger_count, floor) (JuliaDB)\" => Trial(190.741 ms)\n",
" \"count(dayofweek, passenger_count, floor) (pandas)\" => Trial(115.429 ms)\n",
" \"count(dayofweek, passenger_count, floor(Int)) (JuliaDB)\" => Trial(95.103 ms)\n",
" \"count(passenger_count, dayofweek) (indexed JuliaDB)\" => Trial(5.207 ms)\n",
" \"count(passenger_count, dayofweek) (indexed pandas)\" => Trial(51.726 ms)"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"benchmarks"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Julia 0.6.0",
"language": "julia",
"name": "julia-0.6"
},
"language_info": {
"file_extension": ".jl",
"mimetype": "application/julia",
"name": "julia",
"version": "0.6.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment