Skip to content

Instantly share code, notes, and snippets.

@wesm
Created October 4, 2012 04:10
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save wesm/3831420 to your computer and use it in GitHub Desktop.
Save wesm/3831420 to your computer and use it in GitHub Desktop.
Parser shootout
# pylint: disable=W0612
import time
import pandas as pd
import numpy as np
import iopro
import gc
def _wikipedia_options(adapter):
    """Declare explicit field types on *adapter* for the 'wikipedia' dataset.

    Calls ``adapter.set_field_types`` with fields 0 and 1 typed as
    ``object``, field 2 as ``'i4'`` and field 3 as ``'i8'``.

    Parameters
    ----------
    adapter : object
        Anything exposing ``set_field_types(mapping)``. In this script it
        is the value returned by ``iopro.text_adapter``.
    """
    adapter.set_field_types({0: object, 1: object, 2: 'i4', 3: 'i8'})
# Optional per-dataset hooks for the iopro benchmark: when a dataset name is
# present here, its value is called with the adapter before the data is read.
_iopro_extras = {
    'wikipedia': _wikipedia_options,
}
# Benchmark dataset name -> path of the file handed to each parser.
_filenames = {
    'zero-matrix': 'zeros.csv',
    'double-matrix': 'matrix.csv',
    'wikipedia': 'pagecounts-20110331-220000',
    'fec': 'P00000001-ALL.csv',
    'astro': 'sdss6949386.csv',
}
# Field delimiter overrides by dataset name; datasets not listed here are
# read with ',' by the timing functions.
_delimiters = {
    'wikipedia': ' ',
}
def pandas_timings(exclude=(), **kwds):
    """Time ``pd.read_csv`` on every file listed in ``_filenames``.

    Parameters
    ----------
    exclude : container of str, optional
        Dataset names (keys of ``_filenames``) to skip.
    **kwds
        Extra keyword arguments forwarded unchanged to ``pd.read_csv``.

    Returns
    -------
    dict
        Maps each dataset name that was read to the elapsed wall-clock
        seconds (``time.time`` difference) of its ``pd.read_csv`` call.
    """
    result = {}
    # ``items()`` and the call form of ``print`` behave the same on
    # Python 2 and Python 3, unlike ``iteritems()`` / the print statement.
    for name, path in _filenames.items():
        if name in exclude:
            continue

        print(name)
        delim = _delimiters.get(name, ',')

        start = time.time()
        table = pd.read_csv(path, delimiter=delim, **kwds)
        end = time.time()

        result[name] = end - start
        print('%s took %.2f sec' % (name, result[name]))

        # Release the parsed table and collect before the next file;
        # presumably so one read's memory does not skew the next timing.
        table = None
        gc.collect()
    return result
def iopro_timings(exclude=()):
    """Time a full read through ``iopro.text_adapter`` for every dataset.

    For each entry of ``_filenames`` the adapter is created, the optional
    hook from ``_iopro_extras`` is applied, and the whole file is read
    with ``adapter[:]``; all three steps are inside the timed region.

    Parameters
    ----------
    exclude : container of str, optional
        Dataset names (keys of ``_filenames``) to skip.

    Returns
    -------
    dict
        Maps each dataset name that was read to the elapsed wall-clock
        seconds (``time.time`` difference).
    """
    result = {}
    # ``items()`` and the call form of ``print`` behave the same on
    # Python 2 and Python 3, unlike ``iteritems()`` / the print statement.
    for name, path in _filenames.items():
        if name in exclude:
            continue

        print(name)
        delim = _delimiters.get(name, ',')

        start = time.time()
        adapter = iopro.text_adapter(path, delimiter=delim)
        modifier = _iopro_extras.get(name)
        if modifier:
            modifier(adapter)
        # read full array
        table = adapter[:]
        end = time.time()

        result[name] = end - start
        print('%s took %.2f sec' % (name, result[name]))

        # Release the parsed array and collect before the next file;
        # presumably so one read's memory does not skew the next timing.
        table = None
        gc.collect()
    return result
# Reference timings for R, in seconds: the "user" figures from the
# system.time() transcripts recorded in the comments further down.
r_results = pd.Series({
    'zero-matrix': 0.616,
    'double-matrix': 6.92,
    'astro': 37.03,
    'wikipedia': 42.25,
    'fec': 18.121,
})
# First pass: both parsers with their default settings, next to the recorded
# R numbers. The timing calls execute in the order written (iopro, then
# pandas) when this literal is evaluated.
results = {
    'iopro': iopro_timings(),
    'pandas': pandas_timings(),
    'R': r_results,
}
# R commands behind the r_results figures, with the output of each:
#
# system.time(df <- read.csv('parser_examples/zeros.csv', colClasses=rep("integer", 50)))
#    user  system elapsed
#   0.616   0.004   0.623
#
# system.time(df <- read.csv('parser_examples/matrix.csv', colClasses=rep("numeric", 10)))
#    user  system elapsed
#   6.920   0.136   7.071
#
# system.time(df <- read.csv('parser_examples/sdss6949386.csv', colClasses=rep("numeric", 8)))
#    user  system elapsed
#  37.030   0.804  37.866
#
# system.time(df <- read.table('parser_examples/pagecounts-20110331-220000', sep=" ",
#                              header=F,
#                              colClasses=c("character", "character", "integer", "numeric")))
#    user  system elapsed
#  42.250   0.356  42.651
#
# system.time(df <- read.csv('parser_examples/P00000001-ALL.csv'))
#    user  system elapsed
#  18.121   0.212  18.350
# Tabulate the first-pass timings: the dict keys become the columns.
results = pd.DataFrame(results)

# Divide every column row-by-row by the 'pandas' column, so each timing is
# expressed as a multiple of the pandas timing for the same dataset.
results_norm = results.div(results['pandas'], axis=0)
# Second pass: skip the 'wikipedia' and 'fec' datasets and run pandas with
# na_filter=False and as_recarray=True. This rebinds `results` and
# `results_norm`, replacing the first-pass values.
# NOTE(review): `as_recarray` is not accepted by recent pandas releases of
# read_csv -- confirm the pandas version this script targets.
exclude = ['wikipedia', 'fec']

results = {
    'iopro': iopro_timings(exclude=exclude),
    'pandas': pandas_timings(
        exclude=exclude,
        na_filter=False,
        as_recarray=True,
    ),
}

results = pd.DataFrame(results)
results_norm = results.div(results['pandas'], axis=0)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment