wphicks/benchmark_pandas.py

## benchmark_pandas.py
def benchmark_pandas():
    """Time several ways of taking row-wise and column-wise means.

    The same random data is held both as a plain dict of numpy arrays
    (``column_data``) and as a pandas DataFrame (``data_frame``): 25 columns
    keyed 'a'..'y', 1000 values each.  Every case is timed with
    ``timerit.Timerit(100, bestof=10, label=...)``.

    * "Row Access" cases compute one mean per row (1000 results).
    * "Column Access" cases compute one mean per column (25 results).

    The ``# Timed ...`` comments are measurements recorded by the original
    author on their machine; they are informational only.  In the original
    code the ``apply`` / ``mean`` / ``values`` cases were attached to the
    opposite axis from their label; each recorded number below has been kept
    next to the call that actually produced it.

    Returns:
        None.  Timings are reported by ``timerit`` under each case's label.
    """
    import timerit
    import pandas as pd
    import numpy as np

    def make_timer(label):
        # One place for the repetition settings so all cases are comparable.
        return timerit.Timerit(100, bestof=10, label=label)

    # NOTE: range()'s upper bound is exclusive, so the keys are 'a'..'y'
    # (25 columns), not the full alphabet.
    column_data = {
        key: np.random.rand(1000)
        for key in map(chr, range(ord('a'), ord('z')))
    }
    data_frame = pd.DataFrame(column_data)

    print('\n-----')

    # ---- Row access: one mean per row -------------------------------------

    for timer in make_timer('Row Access (PANDAS.loc)'):
        with timer:
            for i in range(len(data_frame)):
                data_frame.loc[i].mean()
    # Timed best=158.0 ms, mean=159.2 ± 0.7 ms for Row Access (PANDAS.loc)

    for timer in make_timer('Row Access (PANDAS.iloc)'):
        with timer:
            for i in range(len(data_frame)):
                data_frame.iloc[i].mean()
    # Timed best=138.7 ms, mean=139.2 ± 0.43 ms for Row Access (PANDAS.iloc)

    for timer in make_timer('Row Access (PANDAS.iterrows)'):
        with timer:
            # iterrows() yields (index, row) pairs; row[1] is the row Series.
            for row in data_frame.iterrows():
                np.mean(row[1])
    # Timed best=86.01 ms, mean=86.67 ± 0.26 ms for Row Access
    # (PANDAS.iterrows)

    for timer in make_timer('Row Access (DICT)'):
        with timer:
            # axis=1 reduces each gathered row separately, matching the
            # per-row means of the other "Row Access" cases.
            np.mean([
                [column_data[k][i] for k in column_data.keys()]
                for i in range(len(data_frame))
            ], axis=1)
    # Timed best=3.67 ms, mean=3.71 ± 0.022 ms for Row Access (DICT)
    # (recorded before axis=1 was added, i.e. for one overall mean)

    for timer in make_timer('Row Access (PANDAS.apply)'):
        with timer:
            # axis=1 hands each *row* to the function.
            data_frame.apply(lambda row: row.mean(), axis=1)
    # Timed best=62.63 ms, mean=63.06 ± 0.22 ms
    # (recorded when this axis=1 call was labelled 'Column Access')

    for timer in make_timer('Row Access (PANDAS.mean)'):
        with timer:
            data_frame.mean(axis=1)
    # Timed best=202.2 µs, mean=208.6 ± 5.8 µs
    # (recorded when this axis=1 call was labelled 'Column Access')

    for timer in make_timer('Row Access (PANDAS.values)'):
        with timer:
            data_frame.values.mean(axis=1)
    # Timed best=23.27 µs, mean=24.87 ± 4.4 µs
    # (recorded when this axis=1 call was labelled 'Column Access')

    print('\n-----')

    # ---- Column access: one mean per column -------------------------------

    for timer in make_timer('Column Access (PANDAS)'):
        with timer:
            for k in data_frame.keys():
                data_frame[k].mean()
    # Timed best=1.497 ms, mean=1.507 ± 0.0056 ms for Column Access (PANDAS)

    for timer in make_timer('Column Access (DICT)'):
        with timer:
            for k in column_data.keys():
                column_data[k].mean()
    # Timed best=105.0 µs, mean=106.4 ± 0.85 µs for Column Access (DICT)

    for timer in make_timer('Column Access (PANDAS.apply)'):
        with timer:
            # axis=0 hands each *column* to the function.
            data_frame.apply(lambda col: col.mean(), axis=0)
    # Timed best=2.151 ms, mean=2.171 ± 0.023 ms
    # (recorded when this axis=0 call was labelled 'Row Access')

    for timer in make_timer('Column Access (PANDAS.mean)'):
        with timer:
            data_frame.mean(axis=0)
    # Timed best=189.2 µs, mean=190.7 ± 1.1 µs
    # (recorded when this axis=0 call was labelled 'Row Access')

    for timer in make_timer('Column Access (PANDAS.values)'):
        with timer:
            data_frame.values.mean(axis=0)
    # Timed best=21.55 µs, mean=21.83 ± 0.46 µs
    # (recorded when this axis=0 call was labelled 'Row Access')


# Script entry point: run the benchmarks only when this file is executed
# directly, not when it is imported.
if __name__ == '__main__':
    benchmark_pandas()
	def benchmark_pandas():
	    """Time several ways of taking row-wise and column-wise means.

	    The same random data is held both as a plain dict of numpy arrays
	    (``column_data``) and as a pandas DataFrame (``data_frame``): 25
	    columns keyed 'a'..'y', 1000 values each.  Every case is timed with
	    ``timerit.Timerit(100, bestof=10, label=...)``.

	    * "Row Access" cases compute one mean per row (1000 results).
	    * "Column Access" cases compute one mean per column (25 results).

	    The ``# Timed ...`` comments are measurements recorded by the
	    original author on their machine; they are informational only.  In
	    the original code the ``apply`` / ``mean`` / ``values`` cases were
	    attached to the opposite axis from their label; each recorded number
	    below has been kept next to the call that actually produced it.

	    Returns:
	        None.  Timings are reported by ``timerit`` under each case's
	        label.
	    """
	    import timerit
	    import pandas as pd
	    import numpy as np

	    def make_timer(label):
	        # One place for the repetition settings so cases are comparable.
	        return timerit.Timerit(100, bestof=10, label=label)

	    # NOTE: range()'s upper bound is exclusive, so the keys are 'a'..'y'
	    # (25 columns), not the full alphabet.
	    column_data = {
	        key: np.random.rand(1000)
	        for key in map(chr, range(ord('a'), ord('z')))
	    }
	    data_frame = pd.DataFrame(column_data)

	    print('\n-----')

	    # ---- Row access: one mean per row ---------------------------------

	    for timer in make_timer('Row Access (PANDAS.loc)'):
	        with timer:
	            for i in range(len(data_frame)):
	                data_frame.loc[i].mean()
	    # Timed best=158.0 ms, mean=159.2 ± 0.7 ms for Row Access (PANDAS.loc)

	    for timer in make_timer('Row Access (PANDAS.iloc)'):
	        with timer:
	            for i in range(len(data_frame)):
	                data_frame.iloc[i].mean()
	    # Timed best=138.7 ms, mean=139.2 ± 0.43 ms for Row Access
	    # (PANDAS.iloc)

	    for timer in make_timer('Row Access (PANDAS.iterrows)'):
	        with timer:
	            # iterrows() yields (index, row) pairs; row[1] is the Series.
	            for row in data_frame.iterrows():
	                np.mean(row[1])
	    # Timed best=86.01 ms, mean=86.67 ± 0.26 ms for Row Access
	    # (PANDAS.iterrows)

	    for timer in make_timer('Row Access (DICT)'):
	        with timer:
	            # axis=1 reduces each gathered row separately, matching the
	            # per-row means of the other "Row Access" cases.
	            np.mean([
	                [column_data[k][i] for k in column_data.keys()]
	                for i in range(len(data_frame))
	            ], axis=1)
	    # Timed best=3.67 ms, mean=3.71 ± 0.022 ms for Row Access (DICT)
	    # (recorded before axis=1 was added, i.e. for one overall mean)

	    for timer in make_timer('Row Access (PANDAS.apply)'):
	        with timer:
	            # axis=1 hands each *row* to the function.
	            data_frame.apply(lambda row: row.mean(), axis=1)
	    # Timed best=62.63 ms, mean=63.06 ± 0.22 ms
	    # (recorded when this axis=1 call was labelled 'Column Access')

	    for timer in make_timer('Row Access (PANDAS.mean)'):
	        with timer:
	            data_frame.mean(axis=1)
	    # Timed best=202.2 µs, mean=208.6 ± 5.8 µs
	    # (recorded when this axis=1 call was labelled 'Column Access')

	    for timer in make_timer('Row Access (PANDAS.values)'):
	        with timer:
	            data_frame.values.mean(axis=1)
	    # Timed best=23.27 µs, mean=24.87 ± 4.4 µs
	    # (recorded when this axis=1 call was labelled 'Column Access')

	    print('\n-----')

	    # ---- Column access: one mean per column ---------------------------

	    for timer in make_timer('Column Access (PANDAS)'):
	        with timer:
	            for k in data_frame.keys():
	                data_frame[k].mean()
	    # Timed best=1.497 ms, mean=1.507 ± 0.0056 ms for Column Access
	    # (PANDAS)

	    for timer in make_timer('Column Access (DICT)'):
	        with timer:
	            for k in column_data.keys():
	                column_data[k].mean()
	    # Timed best=105.0 µs, mean=106.4 ± 0.85 µs for Column Access (DICT)

	    for timer in make_timer('Column Access (PANDAS.apply)'):
	        with timer:
	            # axis=0 hands each *column* to the function.
	            data_frame.apply(lambda col: col.mean(), axis=0)
	    # Timed best=2.151 ms, mean=2.171 ± 0.023 ms
	    # (recorded when this axis=0 call was labelled 'Row Access')

	    for timer in make_timer('Column Access (PANDAS.mean)'):
	        with timer:
	            data_frame.mean(axis=0)
	    # Timed best=189.2 µs, mean=190.7 ± 1.1 µs
	    # (recorded when this axis=0 call was labelled 'Row Access')

	    for timer in make_timer('Column Access (PANDAS.values)'):
	        with timer:
	            data_frame.values.mean(axis=0)
	    # Timed best=21.55 µs, mean=21.83 ± 0.46 µs
	    # (recorded when this axis=0 call was labelled 'Row Access')


	if __name__ == '__main__':
	    # Run the benchmarks only when executed as a script.
	    benchmark_pandas()