Skip to content

Instantly share code, notes, and snippets.

@danbirken
Created January 21, 2014 02:05
Show Gist options
  • Save danbirken/8533199 to your computer and use it in GitHub Desktop.
Save danbirken/8533199 to your computer and use it in GitHub Desktop.
Timing to_datetime():
Datetime format: %m-%d-%Y
---------------
Without infer_format: cad7e6333f1cd2ebe272d7fc7553cd27 - 3.082s
With infer_format: cad7e6333f1cd2ebe272d7fc7553cd27 - 0.268s (11.51x baseline)
Passing the format: cad7e6333f1cd2ebe272d7fc7553cd27 - 0.267s (11.54x baseline)
Datetime format: %m/%d/%Y %H:%M:%S.%f
---------------
Without infer_format: ec2ef27adb2e95e2c5386d8d5d8513a1 - 5.162s
With infer_format: ec2ef27adb2e95e2c5386d8d5d8513a1 - 0.501s (10.31x baseline)
Passing the format: ec2ef27adb2e95e2c5386d8d5d8513a1 - 0.509s (10.14x baseline)
Datetime format: %Y-%m-%dT%H:%M:%S.%f
---------------
Without infer_format: ec2ef27adb2e95e2c5386d8d5d8513a1 - 0.013s
With infer_format: ec2ef27adb2e95e2c5386d8d5d8513a1 - 0.012s (1.06x baseline)
Passing the format: ec2ef27adb2e95e2c5386d8d5d8513a1 - 0.491s (0.03x baseline)
Testing reading CSV:
Datetime format: %m-%d-%Y
---------------
Without infer_format: e8bde3ae42c769a7509cb94e9f36ca5e - 3.085s
With infer_format: e8bde3ae42c769a7509cb94e9f36ca5e - 0.298s (10.37x baseline)
With strptime date_parser: e8bde3ae42c769a7509cb94e9f36ca5e - 0.906s (3.41x baseline)
Datetime format: %m/%d/%Y %H:%M:%S.%f
---------------
Without infer_format: 2f7019b7f7795146b4de4e83fbc0ebb5 - 5.230s
With infer_format: 2f7019b7f7795146b4de4e83fbc0ebb5 - 0.555s (9.42x baseline)
With strptime date_parser: 2f7019b7f7795146b4de4e83fbc0ebb5 - 1.245s (4.20x baseline)
Datetime format: %Y-%m-%dT%H:%M:%S.%f
---------------
Without infer_format: f0028f10cf5d2fc66900e1a57d5bbf9a - 0.073s
With infer_format: f0028f10cf5d2fc66900e1a57d5bbf9a - 0.062s (1.19x baseline)
With strptime date_parser: f0028f10cf5d2fc66900e1a57d5bbf9a - 1.249s (0.06x baseline)
import datetime
import hashlib
import os
import tempfile
import time
import timeit

import pandas as pd
def time_with_hash(desc, f, baseline=None):
    """Time one call of ``f`` and print the duration next to a result digest.

    The MD5 digest of the JSON form of ``f()``'s return value is printed with
    the timing, so runs that are expected to produce the same data can be
    compared by eye.

    Args:
        desc: Label for the measurement; right-aligned in a 30-character
            column.
        f: Zero-argument callable. Its return value must provide a
            ``to_json()`` method.
        baseline: Optional duration, in seconds, of a reference run. When
            given, ``baseline / timing`` is appended as a speed-up factor.

    Returns:
        Elapsed seconds for the single call to ``f``.
    """
    # timeit.default_timer selects a high-resolution clock for the platform,
    # which matters for the sub-20ms measurements this script produces.
    start = timeit.default_timer()
    output = f()
    timing = timeit.default_timer() - start

    # hashlib needs bytes; encode only when to_json() returned text so the
    # digest is the same on Python 2 (str is bytes) and Python 3.
    payload = output.to_json()
    if not isinstance(payload, bytes):
        payload = payload.encode('utf-8')

    line = '%30s: %s - %.3fs' % (
        desc,
        hashlib.md5(payload).hexdigest(),
        timing,
    )

    if baseline is not None:
        # A run faster than the clock resolution reports 0 elapsed; avoid
        # ZeroDivisionError and show the factor as infinite instead.
        speedup = baseline / timing if timing else float('inf')
        line += ' (%.2fx baseline)' % speedup

    # Single-argument print() emits the same text on Python 2 and Python 3.
    print(line)
    return timing
# strftime/strptime patterns exercised by the benchmark loops that iterate
# over this list.
test_formats = [
# Date only: month-day-year separated by dashes.
'%m-%d-%Y',
# Slash-separated date followed by a time with fractional seconds (%f).
'%m/%d/%Y %H:%M:%S.%f',
# Year-first date and time joined by a literal 'T', with fractional seconds.
'%Y-%m-%dT%H:%M:%S.%f',
]
# Benchmark 1: parse an in-memory Series of 50,000 formatted timestamp
# strings with pd.to_datetime(), once per entry in test_formats. Every print
# call takes a single argument so the output is the same on Python 2 and 3.
print('Timing to_datetime():')
for test_format in test_formats:
    # Hourly timestamps starting 2000-01-01, rendered as strings in the
    # format under test.
    s = (
        pd
        .Series(pd.date_range('20000101', periods=50000, freq='H'))
        .apply(lambda x: x.strftime(test_format))
    )

    print('Datetime format: %s' % test_format)
    print('---------------')

    # Default parsing is the reference the other two runs are compared to.
    baseline = time_with_hash(
        'Without infer_format',
        lambda: pd.to_datetime(s),
    )
    # The released pandas keyword is `infer_datetime_format` (the same name
    # read_csv uses); `infer_format` is not an accepted argument. The printed
    # label is left unchanged so output stays comparable with earlier runs.
    time_with_hash(
        'With infer_format',
        lambda: pd.to_datetime(s, infer_datetime_format=True),
        baseline=baseline
    )
    time_with_hash(
        'Passing the format',
        lambda: pd.to_datetime(s, format=test_format),
        baseline=baseline
    )

    # Blank line after each format's results.
    print('')

# Blank line separating this section from whatever is printed next.
print('')
# Benchmark 2: write the formatted timestamps to a CSV file and time
# pd.read_csv() date parsing, once per entry in test_formats. Every print
# call takes a single argument so the output is the same on Python 2 and 3.
print('Testing reading CSV:')

# Use a scratch file rather than a fixed '/tmp/test.csv': it works on
# platforms without /tmp, cannot clobber an existing file, and is removed
# when the benchmark finishes.
csv_fd, csv_path = tempfile.mkstemp(suffix='.csv')
os.close(csv_fd)  # only the path is needed; pandas opens the file itself
try:
    for test_format in test_formats:
        # Hourly timestamps starting 2000-01-01, rendered as strings in the
        # format under test, then written out for read_csv to parse.
        s = (
            pd
            .Series(pd.date_range('20000101', periods=50000, freq='H'))
            .apply(lambda x: x.strftime(test_format))
        )
        s.to_csv(csv_path)

        def date_parser(value):
            """Parse ``value`` with strptime using the current test_format."""
            return datetime.datetime.strptime(value, test_format)

        print('Datetime format: %s' % test_format)
        print('---------------')

        # parse_dates=[1] selects the second CSV column for date parsing.
        # Default parsing is the reference the other two runs are compared to.
        baseline = time_with_hash(
            'Without infer_format',
            lambda: pd.read_csv(
                csv_path,
                parse_dates=[1],
            ),
        )
        time_with_hash(
            'With infer_format',
            lambda: pd.read_csv(
                csv_path,
                parse_dates=[1],
                infer_datetime_format=True
            ),
            baseline=baseline
        )
        time_with_hash(
            'With strptime date_parser',
            lambda: pd.read_csv(
                csv_path,
                parse_dates=[1],
                date_parser=date_parser,
            ),
            baseline=baseline
        )

        # Blank line after each format's results.
        print('')
finally:
    os.remove(csv_path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment