Last active
January 25, 2017 01:21
-
-
Save tdeboissiere/def451b1e5e56c1fd8c8c133d7ffa8dc to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
from tsfresh.feature_extraction import feature_calculators as fc | |
from time import time | |
def count_above_mean(x):
    """Return how many entries of ``x`` are strictly greater than its mean.

    :param x: array-like of numbers (converted with ``np.asarray``).
    :return: the number of entries above the arithmetic mean, as an int.
    """
    values = np.asarray(x)
    is_above = values > values.mean()
    # Counting the True flags is the same as measuring the length of the
    # index array returned by ``np.where``.
    return int(np.count_nonzero(is_above))
def count_below_mean(x):
    """Return how many entries of ``x`` are strictly smaller than its mean.

    :param x: array-like of numbers (converted with ``np.asarray``).
    :return: the number of entries below the arithmetic mean, as an int.
    """
    values = np.asarray(x)
    is_below = values < values.mean()
    # Counting the True flags is the same as measuring the length of the
    # index array returned by ``np.where``.
    return int(np.count_nonzero(is_below))
def percentage_of_reoccurring_datapoints_to_all_datapoints(x):
    """Return the fraction of distinct values in ``x`` that occur more than once.

    The denominator is the number of *distinct* values, not ``len(x)``.

    :param x: array-like accepted by ``np.unique``.
    :return: (# distinct values seen at least twice) / (# distinct values).
    """
    _, occurrences = np.unique(x, return_counts=True)
    # One flag per distinct value: True when that value shows up 2+ times.
    repeated = occurrences >= 2
    return np.sum(repeated) / float(occurrences.shape[0])
def sum_of_reoccurring_values(x):
    """Return the sum of every occurrence of the values that repeat in ``x``.

    Each distinct value seen at least twice contributes ``value * count``;
    values seen only once contribute nothing.

    :param x: array-like accepted by ``np.unique``.
    :return: the weighted sum described above.
    """
    distinct, occurrences = np.unique(x, return_counts=True)
    # Zero the weight of values that appear only once, keep the rest as-is.
    weights = np.where(occurrences < 2, 0, occurrences)
    return np.sum(weights * distinct)
def build_df(num_samples_per_id, num_id, num_cols):
    """Build a random benchmark DataFrame with an integer ``ID`` column.

    The frame has ``num_id * num_samples_per_id`` rows. Each of the
    ``num_cols`` feature columns (named ``col0``, ``col1``, ...) is filled,
    at random, either with uniform floats in [0, 1) or with integers in
    [0, 10). The ``ID`` column is drawn at random from [0, num_id), so the
    number of rows per ID is only ``num_samples_per_id`` on average.

    :param num_samples_per_id: multiplier used to size the frame.
    :param num_id: number of distinct IDs to draw from.
    :param num_cols: number of feature columns.
    :return: the generated ``pd.DataFrame``.
    """
    total_rows = num_id * num_samples_per_id
    table = np.zeros((total_rows, num_cols))
    for idx in range(num_cols):
        # Coin flip per column: continuous or small-integer values.
        if np.random.randint(0, 2) == 0:
            column = np.random.uniform(0, 1, total_rows)
        else:
            column = np.random.randint(0, 10, total_rows)
        table[:, idx] = column
    names = ["col%s" % idx for idx in range(num_cols)]
    frame = pd.DataFrame(data=table, columns=names)
    frame["ID"] = np.random.randint(0, num_id, total_rows)
    return frame
def time_groupby(g, agg_func, flag, previous_time=None):
    """Time ``g.agg(agg_func)`` and print the elapsed wall-clock time.

    :param g: object exposing ``agg`` (here, a pandas groupby).
    :param agg_func: aggregation callable; its ``__name__`` is printed.
    :param flag: free-form label printed next to the function name.
    :param previous_time: optional reference duration in seconds; when given,
        the speedup ``previous_time / elapsed`` is printed as well.
    :return: tuple ``(aggregation result, elapsed seconds)``.
    """
    start = time()
    out = g.agg(agg_func)
    t = time() - start

    # Compare against None explicitly so that a reference time of 0.0 is not
    # silently treated as "no reference given".
    if previous_time is None:
        # Report the same measurement that is returned, not a later reading.
        print("Time %s %s: %.3g" % (agg_func.__name__, flag, t))
    else:
        # The timer may report 0 elapsed for very fast calls; avoid dividing
        # by zero in that case.
        speedup = previous_time / t if t > 0 else float("inf")
        print("Time %s %s: %.3g (speedup: %.3gx)"
              % (agg_func.__name__, flag, t, speedup))
    return out, t
def test_speed(df):
    """Benchmark the tsfresh calculators against the local re-implementations.

    For each pair, both functions are applied to ``df`` grouped by ``"ID"``,
    their timings are printed, and the aggregated columns are asserted to be
    numerically close.

    :param df: DataFrame containing an ``"ID"`` column to group on.
    """
    grouped = df.groupby("ID")

    # (tsfresh reference, proposed replacement) pairs, compared one by one.
    function_pairs = [
        (fc.count_above_mean, count_above_mean),
        (fc.count_below_mean, count_below_mean),
        (fc.percentage_of_reoccurring_datapoints_to_all_datapoints,
         percentage_of_reoccurring_datapoints_to_all_datapoints),
        (fc.sum_of_reoccurring_values, sum_of_reoccurring_values),
    ]

    for reference, candidate in function_pairs:
        print("")
        expected, baseline = time_groupby(grouped, reference, "tsfresh")
        actual, _ = time_groupby(grouped, candidate, "proposed",
                                 previous_time=baseline)
        for column in expected.columns.values:
            assert np.allclose(expected[column].values, actual[column].values)
if __name__ == "__main__":
    # 10000 * 20 = 200000 rows spread over 20 random IDs, 30 feature columns.
    df = build_df(num_samples_per_id=10000, num_id=20, num_cols=30)
    test_speed(df)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment