Skip to content

Instantly share code, notes, and snippets.

@tdeboissiere
Last active January 25, 2017 01:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tdeboissiere/def451b1e5e56c1fd8c8c133d7ffa8dc to your computer and use it in GitHub Desktop.
Save tdeboissiere/def451b1e5e56c1fd8c8c133d7ffa8dc to your computer and use it in GitHub Desktop.
import pandas as pd
import numpy as np
from tsfresh.feature_extraction import feature_calculators as fc
from time import time
def count_above_mean(x):
    """Return how many values in ``x`` are strictly greater than its mean.

    ``x`` is passed through ``np.asarray`` first, so any array-like works.
    """
    values = np.asarray(x)
    above = values > np.mean(values)
    # np.nonzero yields one index array per dimension; the length of the
    # first one is the number of True entries in the mask.
    return len(np.nonzero(above)[0])
def count_below_mean(x):
    """Return how many values in ``x`` are strictly smaller than its mean.

    ``x`` is passed through ``np.asarray`` first, so any array-like works.
    """
    values = np.asarray(x)
    below = values < np.mean(values)
    # np.nonzero yields one index array per dimension; the length of the
    # first one is the number of True entries in the mask.
    return len(np.nonzero(below)[0])
def percentage_of_reoccurring_datapoints_to_all_datapoints(x):
    """Return the fraction of distinct values in ``x`` that occur more than once.

    The denominator is the number of distinct values, not ``len(x)``.
    """
    _, occurrences = np.unique(x, return_counts=True)
    # True for every distinct value that shows up at least twice.
    repeated = occurrences >= 2
    return np.sum(repeated) / float(occurrences.shape[0])
def sum_of_reoccurring_values(x):
    """Return the sum of the elements of ``x`` whose value occurs more than once.

    Each distinct value is weighted by its number of occurrences; values that
    appear only once are weighted by zero.
    """
    distinct, occurrences = np.unique(x, return_counts=True)
    # Multiplying by the boolean mask zeroes the counts of one-off values.
    weights = occurrences * (occurrences >= 2)
    return np.sum(weights * distinct)
def build_df(num_samples_per_id, num_id, num_cols):
    """Build a random DataFrame to benchmark the aggregations on.

    The frame has ``num_id * num_samples_per_id`` rows, ``num_cols`` feature
    columns named ``col0``, ``col1``, ... and an ``ID`` column of random
    integers in ``[0, num_id)``.  Each feature column is filled, on a coin
    flip, either with uniform floats in ``[0, 1)`` or with random integers in
    ``[0, 10)`` (stored as floats).  Because the IDs are drawn at random,
    group sizes are not exactly ``num_samples_per_id``.
    """
    n_rows = num_id * num_samples_per_id
    data = np.zeros((n_rows, num_cols))
    for j in range(num_cols):
        # Coin flip decides between a continuous and a discrete column.
        continuous = np.random.randint(0, 2) == 0
        if continuous:
            data[:, j] = np.random.uniform(0, 1, n_rows)
        else:
            data[:, j] = np.random.randint(0, 10, n_rows)
    names = ["col%s" % j for j in range(num_cols)]
    frame = pd.DataFrame(data=data, columns=names)
    frame["ID"] = np.random.randint(0, num_id, n_rows)
    return frame
def time_groupby(g, agg_func, flag, previous_time=None):
    """Time ``g.agg(agg_func)`` and print the elapsed wall-clock time.

    Args:
        g: Object exposing ``agg`` (this script passes a pandas groupby).
        agg_func: Callable handed to ``g.agg``; its ``__name__`` is printed.
        flag: Label printed next to the function name.
        previous_time: Optional baseline duration in seconds.  When given
            (and non-zero), ``previous_time / elapsed`` is printed as the
            speedup.

    Returns:
        Tuple ``(out, t)``: the aggregation result and the elapsed seconds.
    """
    start = time()
    out = g.agg(agg_func)
    t = time() - start

    # A missing or zero baseline gives no meaningful ratio, and a zero
    # measurement (coarse clock) would raise ZeroDivisionError.
    speedup = previous_time / t if previous_time and t > 0 else None

    # Report the measured ``t`` itself rather than re-reading the clock, so
    # the printed time matches the returned value and the speedup.
    # print(...) with a single pre-formatted string behaves the same on
    # Python 2 and Python 3.
    if speedup is None:
        print("Time %s %s: %.3g" % (agg_func.__name__, flag, t))
    else:
        print("Time %s %s: %.3g (speedup: %.3gx)"
              % (agg_func.__name__, flag, t, speedup))
    return out, t
def test_speed(df):
    """Benchmark each tsfresh feature calculator against its local rewrite.

    ``df`` is grouped by its ``ID`` column.  For every (tsfresh, proposed)
    pair, both aggregations are timed with ``time_groupby`` and their results
    are asserted to agree column by column.
    """
    grouped = df.groupby("ID")
    # (reference implementation from tsfresh, proposed replacement)
    candidate_pairs = [
        (fc.count_above_mean, count_above_mean),
        (fc.count_below_mean, count_below_mean),
        (fc.percentage_of_reoccurring_datapoints_to_all_datapoints,
         percentage_of_reoccurring_datapoints_to_all_datapoints),
        (fc.sum_of_reoccurring_values, sum_of_reoccurring_values),
    ]
    for reference, proposed in candidate_pairs:
        print("")
        expected, baseline = time_groupby(grouped, reference, "tsfresh")
        actual, _ = time_groupby(grouped, proposed, "proposed",
                                 previous_time=baseline)
        for column in expected.columns.values:
            matches = np.isclose(expected[column].values,
                                 actual[column].values)
            assert matches.all()
if __name__ == "__main__":
    # 20 * 10000 = 200000 rows spread over 20 random IDs, 30 feature columns.
    df = build_df(num_samples_per_id=10000, num_id=20, num_cols=30)
    test_speed(df)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment