@shashi · Last active November 27, 2017
JuliaDB benchmarks vs Pandas

Two comparisons: join performance on tables with duplicated keys (the IndexedTables script below vs a pandas merge script adapted from Wes McKinney's benchmark), and CSV loading (`JuliaDB.loadtable` vs `pandas.read_csv`, in the notebook at the end).
using IndexedTables
using PooledArrays
using BenchmarkTools

# key() = randstring(10)   # alternative: string keys
key() = rand()             # floating-point keys

# Large table: 80k rows whose (key1, key2) pairs are sampled from
# 8k unique pairs, i.e. ~10 duplicates per key on average.
key1 = [key() for i = 1:8000]
key2 = [key() for i = 1:8000]
cs = columns(convert(Columns, rand(collect(zip(key1, key2)), 80000)))
# cs = map(PooledArray, cs)   # alternative: pooled (dictionary-encoded) keys
t = table(cs..., rand(1:10^6, 80000))

# Small table: 8k keys, 6k of which also occur in the large table.
key1tail = [key() for i = 1:2000]
key2tail = [key() for i = 1:2000]
cs = (vcat(key1[1:6000], key1tail), vcat(key2[1:6000], key2tail))
smallt = table(cs..., rand(8000))

# Benchmark the three join flavours on the first two (key) columns.
y = @btime innerjoin(t, smallt, lkey=(1,2), rkey=(1,2), cache=false)
z = @btime outerjoin(t, smallt, lkey=(1,2), rkey=(1,2), cache=false)
x = @btime leftjoin(t, smallt, lkey=(1,2), rkey=(1,2), cache=false)
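For reference, a minimal sketch of the IndexedTables join API on toy data (my addition; the column names `:id`, `:val`, `:w` are made up for illustration, and `lkey`/`rkey` accept column names as well as the positional tuples used above):

```julia
using IndexedTables

a = table([1, 2, 3], ["x", "y", "z"], names=[:id, :val])
b = table([2, 3, 4], [0.2, 0.3, 0.4], names=[:id, :w])

# innerjoin keeps only rows whose key occurs in both tables,
# so this returns the rows with id = 2 and id = 3.
innerjoin(a, b, lkey=:id, rkey=:id)
```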
The pandas counterpart of the join benchmark:

# original script by Wes McKinney
import gc
import time
from io import StringIO
from random import random

import numpy as np
from pandas import *
# from pandas.util.testing import rands

N = 10000
ngroups = 10

def get_test_data(ngroups=100, n=N):
    unique_groups = list(range(ngroups))
    arr = np.asarray(np.tile(unique_groups, n // ngroups), dtype=object)
    if len(arr) < n:
        arr = np.asarray(list(arr) + unique_groups[:n - len(arr)],
                         dtype=object)
    # np.random.shuffle rather than random.shuffle: the module name
    # `random` is shadowed by the random() function imported above
    np.random.shuffle(arr)
    return arr
# aggregate multiple columns
# df = DataFrame({'key1': get_test_data(ngroups=ngroups),
#                 'key2': get_test_data(ngroups=ngroups),
#                 'data1': np.random.randn(N),
#                 'data2': np.random.randn(N)})
# df2 = DataFrame({'key1': get_test_data(ngroups=ngroups, n=N//10),
#                  'key2': get_test_data(ngroups=ngroups//2, n=N//10),
#                  'value': np.random.randn(N // 10)})
# result = merge(df, df2, on='key2')
N = 10000
indices = np.array([random() for _ in range(N)], dtype='O')
indices2 = np.array([random() for _ in range(N)], dtype='O')
key = np.tile(indices[:8000], 10)
key2 = np.tile(indices2[:8000], 10)

# Same shape as the Julia benchmark: 80k rows over 8k unique (key, key2)
# pairs, plus a small table whose keys partially overlap the large one.
left = DataFrame({'key': key, 'key2': key2,
                  'value': np.random.randn(80000)})
right = DataFrame({'key': indices[2000:], 'key2': indices2[2000:],
                   'value2': np.random.randn(8000)})
right2 = concat([right, right], ignore_index=True)  # many-to-many case (unused below)
join_methods = ['inner', 'outer', 'left', 'right']
results = DataFrame(index=join_methods, columns=[False, True])
niter = 10

print(left)
print(right)

for sort in [False, True]:
    for join_method in join_methods:
        print(len(left), len(right))
        f = lambda: merge(left, right, how=join_method, sort=sort)
        gc.disable()
        start = time.time()
        for _ in range(niter):
            f()
        # average over niter runs; the original seeded `elapsed = 1.0` and
        # took min(), which would silently cap any timing above one second
        elapsed = (time.time() - start) / niter
        gc.enable()
        results.loc[join_method, sort] = elapsed

# results.columns = ['pandas']
results.columns = ['dont_sort', 'sort']
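As an aside (my addition, not part of the original script): the stdlib `timeit` module handles the GC-disabling and repetition above itself, and taking the minimum over several repeats is more robust to noise than a single average. A sketch, assuming the `left`/`right` frames defined above:

```python
import timeit
from pandas import merge

# Best per-call time over 3 repeats of 10 runs each;
# timeit disables GC by default while timing.
best = min(timeit.repeat(lambda: merge(left, right, how='inner', sort=False),
                         repeat=3, number=10)) / 10
print('inner join: %.4f s' % best)
```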
# R results
# many to one
r_results = read_table(StringIO("""base::merge plyr data.table
inner 0.2475 0.1183 0.1100
outer 0.4213 0.1916 0.2090
left 0.2998 0.1188 0.0572
right 0.3102 0.0536 0.0376
"""), sep=r'\s+')

presults = results[['dont_sort']].rename(columns={'dont_sort': 'pandas'})
all_results = presults.join(r_results)
print(all_results)

all_results = all_results.div(all_results['pandas'], axis=0)
all_results = all_results.loc[:, ['pandas', 'data.table', 'plyr',
                                  'base::merge']]

# plain dicts instead of the long-removed DataFrame.from_items
sort_results = DataFrame({'pandas': results['sort'],
                          'R': r_results['base::merge']})
sort_results['Ratio'] = sort_results['R'] / sort_results['pandas']
nosort_results = DataFrame({'pandas': results['dont_sort'],
                            'R': r_results['base::merge']})
nosort_results['Ratio'] = nosort_results['R'] / nosort_results['pandas']
# many to many (cf. right2 above; note the pandas column here still holds the
# many-to-one timings, only the R reference numbers change)
r_results = read_table(StringIO("""base::merge plyr data.table
inner 0.4610 0.1276 0.1269
outer 0.9195 0.1881 0.2725
left 0.6559 0.1257 0.0678
right 0.6425 0.0522 0.0428
"""), sep=r'\s+')

all_results = presults.join(r_results)
all_results = all_results.div(all_results['pandas'], axis=0)
all_results = all_results.loc[:, ['pandas', 'data.table', 'plyr',
                                  'base::merge']]
print(all_results)
# Benchmarks

`loadtable` vs `pandas.read_csv`

(Jupyter notebook, Julia 0.6.0 kernel. The input is a single 1.591 GiB CSV, `data/yellow_tripdata_2016-01.csv`.)

In [1]:

```julia
using JuliaDB, BenchmarkTools

benchmarks = Dict()

benchmarks["JuliaDB.loadtable (DateTime)"] =
    @benchmark loadtable(["data/yellow_tripdata_2016-01.csv"],
                         usecache=false)
```

stdout (the two lines below were printed four times, once per execution of the load):

```
Metadata for 0 / 1 files can be loaded from cache.
Reading 1 csv files totalling 1.591 GiB in 1 batches...
```

Out[1]:

```
BenchmarkTools.Trial:
  memory estimate:  6.81 GiB
  allocs estimate:  32738330
  --------------
  minimum time:     16.809 s (7.50% GC)
  median time:      16.809 s (7.50% GC)
  mean time:        16.809 s (7.50% GC)
  maximum time:     16.809 s (7.50% GC)
  --------------
  samples:          1
  evals/sample:     1
```

In [2]:

```julia
using PyCall
@pyimport pandas as pd
```

In [3]:

```julia
benchmarks["pandas.read_csv (DateTime)"] =
    @benchmark pd.read_csv(
        "data/yellow_tripdata_2016-01.csv",
        parse_dates=["tpep_pickup_datetime", "tpep_dropoff_datetime"],
)
```

Out[3]:

```
BenchmarkTools.Trial:
  memory estimate:  2.36 KiB
  allocs estimate:  53
  --------------
  minimum time:     32.873 s (0.00% GC)
  median time:      32.873 s (0.00% GC)
  mean time:        32.873 s (0.00% GC)
  maximum time:     32.873 s (0.00% GC)
  --------------
  samples:          1
  evals/sample:     1
```

(The tiny memory estimate reflects only Julia-side allocations; pandas allocates on the Python heap, which BenchmarkTools does not see.)

By default pandas reads datetime fields as strings:

```julia
julia> df = pd.read_csv("data/yellow_tripdata_2016-01.csv")
julia> typeof(df[:tpep_pickup_datetime][1])
String
```

But you can pass `parse_dates` to parse them as dates:

```julia
julia> df2 = pd.read_csv("data/yellow_tripdata_2016-01.csv", parse_dates=["tpep_pickup_datetime", "tpep_dropoff_datetime"]);
julia> typeof(df2[:tpep_pickup_datetime][1])
DateTime
```

In [4]:

```julia
benchmarks["pandas.read_csv (string)"] = @benchmark pd.read_csv("data/yellow_tripdata_2016-01.csv")
```

Out[4]:

```
BenchmarkTools.Trial:
  memory estimate:  528 bytes
  allocs estimate:  18
  --------------
  minimum time:     29.649 s (0.00% GC)
  median time:      29.649 s (0.00% GC)
  mean time:        29.649 s (0.00% GC)
  maximum time:     29.649 s (0.00% GC)
  --------------
  samples:          1
  evals/sample:     1
```

In [5]:

```julia
benchmarks["JuliaDB.loadtable (string)"] = @benchmark loadtable(
    ["data/yellow_tripdata_2016-01.csv"],
    usecache=false,
    colparsers=Dict(2=>String, 3=>String)
)
```

stdout (again printed once per execution of the load):

```
Metadata for 0 / 1 files can be loaded from cache.
Reading 1 csv files totalling 1.591 GiB in 1 batches...
```

Out[5]:

```
BenchmarkTools.Trial:
  memory estimate:  7.14 GiB
  allocs estimate:  32742077
  --------------
  minimum time:     18.083 s (13.41% GC)
  median time:      18.083 s (13.41% GC)
  mean time:        18.083 s (13.41% GC)
  maximum time:     18.083 s (13.41% GC)
  --------------
  samples:          1
  evals/sample:     1
```

# Results:

In [6]:

```julia
benchmarks
```

Out[6]:

```
Dict{Any,Any} with 1 entry:
  "JuliaDB.loadtable (string)" => Trial(18.083 s)
```

(Only one entry survives at this point, presumably because `benchmarks` was re-initialized along the way; the next two cells restore the missing results from `Out[1]` and `Out[4]`.)

In [7]:

```julia
benchmarks["JuliaDB.loadtable (DateTime)"] = Out[1]
```

Out[7]:

```
BenchmarkTools.Trial:
  memory estimate:  6.81 GiB
  allocs estimate:  32738330
  --------------
  minimum time:     16.809 s (7.50% GC)
  median time:      16.809 s (7.50% GC)
  mean time:        16.809 s (7.50% GC)
  maximum time:     16.809 s (7.50% GC)
  --------------
  samples:          1
  evals/sample:     1
```

In [8]:

```julia
benchmarks["pandas.read_csv (string)"] = Out[4]
```

Out[8]:

```
BenchmarkTools.Trial:
  memory estimate:  528 bytes
  allocs estimate:  18
  --------------
  minimum time:     29.649 s (0.00% GC)
  median time:      29.649 s (0.00% GC)
  mean time:        29.649 s (0.00% GC)
  maximum time:     29.649 s (0.00% GC)
  --------------
  samples:          1
  evals/sample:     1
```

In [11]:

```julia
benchmarks
```

Out[11]:

```
Dict{Any,Any} with 4 entries:
  "JuliaDB.loadtable (DateTime)" => Trial(16.809 s)
  "pandas.read_csv (string)"     => Trial(29.649 s)
  "JuliaDB.loadtable (string)"   => Trial(18.083 s)
  "pandas.read_csv (DateTime)"   => Trial(32.873 s)
```

In [24]:

```julia
ks = collect(keys(benchmarks))[[1,4,3,2]]
vs = map(k->benchmarks[k], ks)

# minimum(x).time is in nanoseconds; divide by 1e9 to get seconds
# (the original wrote the equivalent but less obvious 10e8)
tbl = table(map(Text, ks), map(x->minimum(x).time/1e9, vs), names=[:benchmark, :result])
```

Out[24]:

```
Table with 4 rows, 2 columns:
benchmark                     result
─────────────────────────────────────
JuliaDB.loadtable (DateTime)  16.8086
pandas.read_csv (DateTime)    32.8733
JuliaDB.loadtable (string)    18.0826
pandas.read_csv (string)      29.6489
```