# tdeboissiere/pandas_speed_checks.py

## pandas_speed_checks.py
import pandas as pd
import numpy as np
from tsfresh.feature_extraction import feature_calculators as fc
from time import time


def count_above_mean(x):
    """Count the entries of ``x`` that are strictly greater than its mean.

    :param x: array-like of numbers; converted with ``np.asarray``
    :return: number of entries above the arithmetic mean of ``x``
    """
    values = np.asarray(x)
    above = values > np.mean(values)
    # One index per True entry, so the length of the first index array
    # is the number of entries above the mean.
    return len(np.nonzero(above)[0])


def count_below_mean(x):
    """Count the entries of ``x`` that are strictly smaller than its mean.

    :param x: array-like of numbers; converted with ``np.asarray``
    :return: number of entries below the arithmetic mean of ``x``
    """
    values = np.asarray(x)
    below = values < np.mean(values)
    # One index per True entry, so the length of the first index array
    # is the number of entries below the mean.
    return len(np.nonzero(below)[0])


def percentage_of_reoccurring_datapoints_to_all_datapoints(x):
    """Return the share of distinct values in ``x`` that occur more than once.

    The denominator is the number of *distinct* values, not ``len(x)``.

    :param x: array-like accepted by ``np.unique``
    :return: (distinct values seen at least twice) / (distinct values)
    """
    _, occurrences = np.unique(x, return_counts=True)
    repeated = occurrences > 1
    # float() keeps this a true division on Python 2 as well.
    return np.sum(repeated) / float(occurrences.size)


def sum_of_reoccurring_values(x):
    """Sum ``value * count`` over the distinct values seen at least twice.

    Values that appear exactly once contribute nothing to the total.

    :param x: array-like accepted by ``np.unique``
    :return: the weighted sum described above
    """
    distinct, occurrences = np.unique(x, return_counts=True)
    # Zero the weight of values that occur only once.
    weights = np.where(occurrences >= 2, occurrences, 0)
    return np.sum(weights * distinct)


def build_df(num_samples_per_id, num_id, num_cols):
    """Build a random DataFrame used as benchmark input.

    The frame has ``num_id * num_samples_per_id`` rows, ``num_cols`` float
    columns named ``col0`` .. ``col<num_cols - 1>`` and an integer ``ID``
    column. IDs are drawn at random from ``[0, num_id)``, so group sizes
    are only ``num_samples_per_id`` on average.

    :param num_samples_per_id: multiplier used to size the frame
    :param num_id: number of distinct IDs to draw from
    :param num_cols: number of value columns
    :return: the generated ``pandas.DataFrame``
    """
    num_rows = num_id * num_samples_per_id
    data = np.zeros((num_rows, num_cols))

    for col_idx in range(num_cols):
        # Coin flip per column: continuous values in [0, 1) or integer
        # values in [0, 10), the latter guaranteeing repeated values.
        if np.random.randint(0, 2) == 0:
            column = np.random.uniform(0, 1, num_rows)
        else:
            column = np.random.randint(0, 10, num_rows)
        data[:, col_idx] = column

    names = ["col%s" % idx for idx in range(num_cols)]
    df = pd.DataFrame(data=data, columns=names)
    df["ID"] = np.random.randint(0, num_id, num_rows)

    return df


def time_groupby(g, agg_func, flag, previous_time=None):
    """Time ``g.agg(agg_func)`` and print the elapsed wall-clock time.

    :param g: object exposing ``agg`` (the caller passes a pandas GroupBy)
    :param agg_func: callable handed to ``g.agg``; its ``__name__`` labels
        the printed line
    :param flag: free-form label printed after the function name
    :param previous_time: optional reference duration in seconds; when
        given (and non-zero) the ratio ``previous_time / elapsed`` is
        printed as the speedup
    :return: tuple ``(aggregation result, elapsed seconds)``
    """
    start = time()
    out = g.agg(agg_func)
    t = time() - start

    # Report the measured duration itself rather than re-reading the clock,
    # so the printed time matches the returned value and the speedup.
    message = "Time %s %s: %.3g" % (agg_func.__name__, flag, t)
    # A coarse clock can report zero elapsed time; skip the ratio then
    # instead of dividing by zero.
    if previous_time and t > 0:
        message += " (speedup: %.3gx)" % (previous_time / t)
    # print() with a single pre-formatted string works on Python 2 and 3.
    print(message)
    return out, t


def test_speed(df):
    """Benchmark each tsfresh calculator against its local replacement.

    For every pair the tsfresh function is timed first, then the local
    one (its speedup is printed relative to the tsfresh timing), and the
    two aggregation results are asserted to be numerically close column
    by column.

    :param df: DataFrame with an ``ID`` column to group on
    """
    grouped = df.groupby("ID")

    # (tsfresh reference, proposed replacement) pairs, compared in order.
    pairs = [
        (fc.count_above_mean, count_above_mean),
        (fc.count_below_mean, count_below_mean),
        (fc.percentage_of_reoccurring_datapoints_to_all_datapoints,
         percentage_of_reoccurring_datapoints_to_all_datapoints),
        (fc.sum_of_reoccurring_values, sum_of_reoccurring_values),
    ]

    for reference, proposed in pairs:

        print("")
        ref_out, ref_time = time_groupby(grouped, reference, "tsfresh")
        new_out, _ = time_groupby(grouped, proposed, "proposed",
                                  previous_time=ref_time)
        for column in ref_out.columns.values:
            assert np.allclose(ref_out[column].values, new_out[column].values)


if __name__ == '__main__':

    # Benchmark input: 20 IDs, 10000 * 20 rows in total, 30 value columns.
    df = build_df(num_samples_per_id=10000, num_id=20, num_cols=30)
    test_speed(df)
	import pandas as pd
	import numpy as np
	from tsfresh.feature_extraction import feature_calculators as fc
	from time import time


	def count_above_mean(x):
	    """Count the entries of ``x`` that are strictly greater than its mean.

	    :param x: array-like of numbers; converted with ``np.asarray``
	    :return: number of entries above the arithmetic mean of ``x``
	    """
	    x = np.asarray(x)
	    m = np.mean(x)
	    # np.where(cond) yields one index per True entry.
	    return np.where(x > m)[0].shape[0]


	def count_below_mean(x):
	    """Count the entries of ``x`` that are strictly smaller than its mean.

	    :param x: array-like of numbers; converted with ``np.asarray``
	    :return: number of entries below the arithmetic mean of ``x``
	    """
	    x = np.asarray(x)
	    m = np.mean(x)
	    # np.where(cond) yields one index per True entry.
	    return np.where(x < m)[0].shape[0]


	def percentage_of_reoccurring_datapoints_to_all_datapoints(x):
	    """Return the share of distinct values in ``x`` that occur more than once.

	    The denominator is the number of *distinct* values, not ``len(x)``.

	    :param x: array-like accepted by ``np.unique``
	    :return: (distinct values seen at least twice) / (distinct values)
	    """
	    unique, counts = np.unique(x, return_counts=True)
	    # Turn the counts into 0/1 flags: 1 for values seen at least twice.
	    counts[counts < 2] = 0
	    counts[counts > 0] = 1
	    # float() keeps this a true division on Python 2 as well.
	    return np.sum(counts) / float(counts.shape[0])


	def sum_of_reoccurring_values(x):
	    """Sum ``value * count`` over the distinct values seen at least twice.

	    Values that appear exactly once contribute nothing to the total.

	    :param x: array-like accepted by ``np.unique``
	    :return: the weighted sum described above
	    """
	    unique, counts = np.unique(x, return_counts=True)
	    # Drop values that occur only once by zeroing their weight.
	    counts[counts < 2] = 0
	    return np.sum(counts * unique)


	def build_df(num_samples_per_id, num_id, num_cols):
	    """Build a random DataFrame used as benchmark input.

	    The frame has ``num_id * num_samples_per_id`` rows, ``num_cols`` float
	    columns named ``col0`` .. ``col<num_cols - 1>`` and an integer ``ID``
	    column whose values are drawn at random from ``[0, num_id)``.

	    :param num_samples_per_id: multiplier used to size the frame
	    :param num_id: number of distinct IDs to draw from
	    :param num_cols: number of value columns
	    :return: the generated ``pandas.DataFrame``
	    """
	    data = np.zeros((num_id * num_samples_per_id, num_cols))

	    for i in range(num_cols):
	        # Coin flip per column: continuous values in [0, 1) or integer
	        # values in [0, 10).
	        r = np.random.randint(0, 2)
	        if r == 0:
	            data[:, i] = np.random.uniform(0, 1, data.shape[0])
	        else:
	            data[:, i] = np.random.randint(0, 10, data.shape[0])

	    df = pd.DataFrame(data=data, columns=["col%s" % i for i in range(num_cols)])
	    df["ID"] = np.random.randint(0, num_id, num_id * num_samples_per_id)

	    return df


	def time_groupby(g, agg_func, flag, previous_time=None):
	    """Time ``g.agg(agg_func)`` and print the elapsed wall-clock time.

	    :param g: object exposing ``agg`` (the caller passes a pandas GroupBy)
	    :param agg_func: callable handed to ``g.agg``; its ``__name__`` labels
	        the printed line
	    :param flag: free-form label printed after the function name
	    :param previous_time: optional reference duration in seconds; when
	        given (and non-zero) the ratio ``previous_time / elapsed`` is
	        printed as the speedup
	    :return: tuple ``(aggregation result, elapsed seconds)``
	    """
	    s = time()
	    out = g.agg(agg_func)
	    t = time() - s

	    # Print the measured duration so it matches the returned value.
	    line = "Time %s %s: %.3g" % (agg_func.__name__, flag, t)
	    # Guard against a zero reading from a coarse clock.
	    if previous_time and t > 0:
	        line += " (speedup: %.3gx)" % (previous_time / t)
	    # print() with one pre-formatted string works on Python 2 and 3.
	    print(line)
	    return out, t


	def test_speed(df):
	    """Benchmark each tsfresh calculator against its local replacement.

	    For every pair the tsfresh function is timed first, then the local
	    one (its speedup is printed relative to the tsfresh timing), and the
	    two aggregation results are asserted to be numerically close column
	    by column.

	    :param df: DataFrame with an ``ID`` column to group on
	    """
	    df_group = df.groupby("ID")

	    list_func_tsfresh = [fc.count_above_mean, fc.count_below_mean,
	                         fc.percentage_of_reoccurring_datapoints_to_all_datapoints,
	                         fc.sum_of_reoccurring_values]

	    # Local replacements, in the same order as the tsfresh list above.
	    list_func = [count_above_mean, count_below_mean,
	                 percentage_of_reoccurring_datapoints_to_all_datapoints,
	                 sum_of_reoccurring_values]

	    for f1, f2 in zip(list_func_tsfresh, list_func):

	        print("")
	        df1, t1 = time_groupby(df_group, f1, "tsfresh")
	        df2, t2 = time_groupby(df_group, f2, "proposed", previous_time=t1)
	        for c in df1.columns.values:
	            assert np.all(np.isclose(df1[c].values, df2[c].values))


	if __name__ == '__main__':

	    # Benchmark input: 20 IDs, 10000 * 20 rows in total, 30 value columns.
	    df = build_df(10000, 20, 30)
	    test_speed(df)