Python 3.7.3 (default, Mar 27 2019, 09:23:15)
Type 'copyright', 'credits' or 'license' for more information
IPython 7.3.0 -- An enhanced Interactive Python. Type '?' for help.
PyDev console: using IPython 7.3.0
Python 3.7.3 (default, Mar 27 2019, 09:23:15)
[Clang 10.0.1 (clang-1001.0.46.3)] on darwin
import pandas as pd
import numpy as np
df = pd.read_csv('/Projects/drink_recommendation_engine/sample_2.csv', sep='\t')
from mlxtend.preprocessing import TransactionEncoder
te = TransactionEncoder()
_items = te.fit_transform(df.groupby(['ticket_id'])['item_name'].apply(lambda x: tuple(x)))
items = pd.DataFrame(_items, columns=te.columns_)
from timeit import repeat
from mlxtend.frequent_patterns.apriori import apriori
from mlxtend.frequent_patterns.fpgrowth import fpgrowth
print("fpgrowth: {} transactions, {} unique items".format(items.shape[0], items.shape[1]))
runs = repeat("fpgrowth(items, min_support=0.01)", globals=globals(), number=1, repeat=5)
print("fastest run: {}, slowest run: {}, mean: {}".format(min(runs), max(runs), np.mean(runs)))
print("apriori: {} transactions, {} unique items".format(items.shape[0], items.shape[1]))
runs = repeat("apriori(items, min_support=0.01)", globals=globals(), number=1, repeat=5)
print("fastest run: {}, slowest run: {}, mean: {}".format(min(runs), max(runs), np.mean(runs)))
df1 = fpgrowth(items, min_support=0.01)
df2 = apriori(items, min_support=0.01)
df3 = pd.merge(df1, df2, left_on=['itemsets'], right_on=['itemsets'])
s1 = df3['support_x']
s2 = df3['support_y']
s1.name = 'support'
s2.name = 'support'
# assert_series_equal returns None when the two series match (and raises otherwise),
# so the print below only runs when the supports agree
if not pd.testing.assert_series_equal(s1, s2): print("Assertion passed")
fpgrowth: 8676 transactions, 966 unique items
fastest run: 0.24607628100000056, slowest run: 0.2710467320000012, mean: 0.25534826600000055
apriori: 8676 transactions, 966 unique items
fastest run: 0.8877421420000005, slowest run: 0.9147161260000001, mean: 0.8998771082000001
Assertion passed
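# Not part of the original gist: a sketch that folds the repeated steps above
# (load the tab-separated tickets, one-hot encode with TransactionEncoder, time
# both miners with timeit.repeat, then check that their supports agree after a
# merge on the itemsets column) into one helper. The function name `benchmark`
# and its default parameters are illustrative, not from the source.
import numpy as np
import pandas as pd
from timeit import repeat
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns.apriori import apriori
from mlxtend.frequent_patterns.fpgrowth import fpgrowth

def benchmark(csv_path, min_support=0.01, number=1, repeats=5):
    df = pd.read_csv(csv_path, sep='\t')
    te = TransactionEncoder()
    encoded = te.fit_transform(df.groupby(['ticket_id'])['item_name'].apply(lambda x: tuple(x)))
    items = pd.DataFrame(encoded, columns=te.columns_)
    for fn in (fpgrowth, apriori):
        print("{}: {} transactions, {} unique items".format(fn.__name__, *items.shape))
        runs = repeat(lambda: fn(items, min_support=min_support), number=number, repeat=repeats)
        print("fastest run: {}, slowest run: {}, mean: {}".format(min(runs), max(runs), np.mean(runs)))
    # Both algorithms should report the same support for every frequent itemset.
    merged = pd.merge(fpgrowth(items, min_support=min_support),
                      apriori(items, min_support=min_support), on='itemsets')
    pd.testing.assert_series_equal(merged['support_x'].rename('support'),
                                   merged['support_y'].rename('support'))
    print("Assertion passed")

# With that helper, each of the sessions below reduces to a single call, e.g.
# benchmark('/Projects/drink_recommendation_engine/sample.csv', min_support=0.05, number=10, repeats=10)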
Python 3.7.3 (default, Mar 27 2019, 09:23:15)
Type 'copyright', 'credits' or 'license' for more information
IPython 7.3.0 -- An enhanced Interactive Python. Type '?' for help.
PyDev console: using IPython 7.3.0
Python 3.7.3 (default, Mar 27 2019, 09:23:15)
[Clang 10.0.1 (clang-1001.0.46.3)] on darwin
import pandas as pd
import numpy as np
df = pd.read_csv('/Projects/drink_recommendation_engine/sample.csv', sep='\t')
from mlxtend.preprocessing import TransactionEncoder
te = TransactionEncoder()
_items = te.fit_transform(df.groupby(['ticket_id'])['item_name'].apply(lambda x: tuple(x)))
items = pd.DataFrame(_items, columns=te.columns_)
from timeit import repeat
from mlxtend.frequent_patterns.apriori import apriori
from mlxtend.frequent_patterns.fpgrowth import fpgrowth
print("fpgrowth: {} transactions, {} unique items".format(items.shape[0], items.shape[1]))
runs = repeat("fpgrowth(items, min_support=0.05)", globals=globals(), number=10, repeat=10)
print("fastest run: {}, slowest run: {}, mean: {}".format(min(runs), max(runs), np.mean(runs)))
print("apriori: {} transactions, {} unique items".format(items.shape[0], items.shape[1]))
runs = repeat("apriori(items, min_support=0.05)", globals=globals(), number=10, repeat=10)
print("fastest run: {}, slowest run: {}, mean: {}".format(min(runs), max(runs), np.mean(runs)))
df1 = fpgrowth(items, min_support=0.01)
df2 = apriori(items, min_support=0.01)
df3 = pd.merge(df1, df2, left_on=['itemsets'], right_on=['itemsets'])
s1 = df3['support_x']
s2 = df3['support_y']
s1.name = 'support'
s2.name = 'support'
if not pd.testing.assert_series_equal(s1, s2): print("Assertion passed")
fpgrowth: 1069 transactions, 244 unique items
fastest run: 0.2882803219999994, slowest run: 0.31644036000000053, mean: 0.30230184899999984
apriori: 1069 transactions, 244 unique items
fastest run: 0.33422621099999983, slowest run: 0.35184302599999917, mean: 0.34135044549999993
Assertion passed
Python 3.7.3 (default, Mar 27 2019, 09:23:15)
Type 'copyright', 'credits' or 'license' for more information
IPython 7.3.0 -- An enhanced Interactive Python. Type '?' for help.
PyDev console: using IPython 7.3.0
Python 3.7.3 (default, Mar 27 2019, 09:23:15)
[Clang 10.0.1 (clang-1001.0.46.3)] on darwin
import pandas as pd
import numpy as np
df = pd.read_csv('/Projects/drink_recommendation_engine/sample_2.csv', sep='\t')
from mlxtend.preprocessing import TransactionEncoder
te = TransactionEncoder()
_items = te.fit_transform(df.groupby(['ticket_id'])['item_name'].apply(lambda x: tuple(x)))
items = pd.DataFrame(_items, columns=te.columns_)
from timeit import repeat
from mlxtend.frequent_patterns.apriori import apriori
from mlxtend.frequent_patterns.fpgrowth import fpgrowth
print("fpgrowth: {} transactions, {} unique items".format(items.shape[0], items.shape[1]))
runs = repeat("fpgrowth(items, min_support=0.01)", globals=globals(), number=1, repeat=5)
print("fastest run: {}, slowest run: {}, mean: {}".format(min(runs), max(runs), np.mean(runs)))
print("apriori: {} transactions, {} unique items".format(items.shape[0], items.shape[1]))
runs = repeat("apriori(items, min_support=0.01)", globals=globals(), number=1, repeat=5)
print("fastest run: {}, slowest run: {}, mean: {}".format(min(runs), max(runs), np.mean(runs)))
df1 = fpgrowth(items, min_support=0.01)
df2 = apriori(items, min_support=0.01)
df3 = pd.merge(df1, df2, left_on=['itemsets'], right_on=['itemsets'])
s1 = df3['support_x']
s2 = df3['support_y']
s1.name = 'support'
s2.name = 'support'
if not pd.testing.assert_series_equal(s1, s2): print("Assertion passed")
fpgrowth: 8676 transactions, 966 unique items
fastest run: 0.2531228709999995, slowest run: 0.2903997089999999, mean: 0.2693362885999999
apriori: 8676 transactions, 966 unique items
fastest run: 0.8125187799999996, slowest run: 0.9776996000000002, mean: 0.8516161937999998
Assertion passed
Python 3.7.3 (default, Mar 27 2019, 09:23:15)
Type 'copyright', 'credits' or 'license' for more information
IPython 7.3.0 -- An enhanced Interactive Python. Type '?' for help.
PyDev console: using IPython 7.3.0
Python 3.7.3 (default, Mar 27 2019, 09:23:15)
[Clang 10.0.1 (clang-1001.0.46.3)] on darwin
import pandas as pd
import numpy as np
df = pd.read_csv('/Projects/drink_recommendation_engine/sample.csv', sep='\t')
from mlxtend.preprocessing import TransactionEncoder
te = TransactionEncoder()
_items = te.fit_transform(df.groupby(['ticket_id'])['item_name'].apply(lambda x: tuple(x)))
items = pd.DataFrame(_items, columns=te.columns_)
from timeit import repeat
from mlxtend.frequent_patterns.apriori import apriori
from mlxtend.frequent_patterns.fpgrowth import fpgrowth
print("fpgrowth: {} transactions, {} unique items".format(items.shape[0], items.shape[1]))
runs = repeat("fpgrowth(items, min_support=0.01)", globals=globals(), number=1, repeat=5)
print("fastest run: {}, slowest run: {}, mean: {}".format(min(runs), max(runs), np.mean(runs)))
print("apriori: {} transactions, {} unique items".format(items.shape[0], items.shape[1]))
runs = repeat("apriori(items, min_support=0.01)", globals=globals(), number=1, repeat=5)
print("fastest run: {}, slowest run: {}, mean: {}".format(min(runs), max(runs), np.mean(runs)))
df1 = fpgrowth(items, min_support=0.01)
df2 = apriori(items, min_support=0.01)
df3 = pd.merge(df1, df2, left_on=['itemsets'], right_on=['itemsets'])
s1 = df3['support_x']
s2 = df3['support_y']
s1.name = 'support'
s2.name = 'support'
if not pd.testing.assert_series_equal(s1, s2): print("Assertion passed")
fpgrowth: 1069 transactions, 244 unique items
fastest run: 0.11113154099999978, slowest run: 0.11296855999999966, mean: 0.11202025359999972
apriori: 1069 transactions, 244 unique items
fastest run: 0.32792389399999955, slowest run: 0.39672938999999996, mean: 0.3459437119999995
Assertion passed
# With min_support=0.05, the new apriori implementation is faster than fpgrowth
import pandas as pd
import numpy as np
df = pd.read_csv('/Projects/drink_recommendation_engine/sample.csv', sep='\t')
from mlxtend.preprocessing import TransactionEncoder
te = TransactionEncoder()
_items = te.fit_transform(df.groupby(['ticket_id'])['item_name'].apply(lambda x: tuple(x)))
items = pd.DataFrame(_items, columns=te.columns_)
from timeit import repeat
from mlxtend.frequent_patterns.apriori import apriori
from mlxtend.frequent_patterns.fpgrowth import fpgrowth
print("fpgrowth: {} transactions, {} unique items".format(items.shape[0], items.shape[1]))
runs = repeat("fpgrowth(items, min_support=0.05)", globals=globals(), number=10, repeat=10)
print("fastest run: {}, slowest run: {}, mean: {}".format(min(runs), max(runs), np.mean(runs)))
print("apriori: {} transactions, {} unique items".format(items.shape[0], items.shape[1]))
runs = repeat("apriori(items, min_support=0.05)", globals=globals(), number=10, repeat=10)
print("fastest run: {}, slowest run: {}, mean: {}".format(min(runs), max(runs), np.mean(runs)))
df1 = fpgrowth(items, min_support=0.01)
df2 = apriori(items, min_support=0.01)
df3 = pd.merge(df1, df2, left_on=['itemsets'], right_on=['itemsets'])
s1 = df3['support_x']
s2 = df3['support_y']
s1.name = 'support'
s2.name = 'support'
if not pd.testing.assert_series_equal(s1, s2): print("Assertion passed")
fpgrowth: 1069 transactions, 244 unique items
fastest run: 0.2869345769999967, slowest run: 0.36350778599999956, mean: 0.3085288325999983
apriori: 1069 transactions, 244 unique items
fastest run: 0.10744446700000765, slowest run: 0.11501072999999451, mean: 0.10995658089999694
Assertion passed
# With min_support=0.01, fpgrowth is much faster than apriori
import pandas as pd
import numpy as np
df = pd.read_csv('/Projects/drink_recommendation_engine/sample.csv', sep='\t')
from mlxtend.preprocessing import TransactionEncoder
te = TransactionEncoder()
_items = te.fit_transform(df.groupby(['ticket_id'])['item_name'].apply(lambda x: tuple(x)))
items = pd.DataFrame(_items, columns=te.columns_)
from timeit import repeat
from mlxtend.frequent_patterns.apriori import apriori
from mlxtend.frequent_patterns.fpgrowth import fpgrowth
print("fpgrowth: {} transactions, {} unique items".format(items.shape[0], items.shape[1]))
runs = repeat("fpgrowth(items, min_support=0.01)", globals=globals(), number=10, repeat=10)
print("fastest run: {}, slowest run: {}, mean: {}".format(min(runs), max(runs), np.mean(runs)))
print("apriori: {} transactions, {} unique items".format(items.shape[0], items.shape[1]))
runs = repeat("apriori(items, min_support=0.01)", globals=globals(), number=10, repeat=10)
print("fastest run: {}, slowest run: {}, mean: {}".format(min(runs), max(runs), np.mean(runs)))
df1 = fpgrowth(items, min_support=0.01)
df2 = apriori(items, min_support=0.01)
df3 = pd.merge(df1, df2, left_on=['itemsets'], right_on=['itemsets'])
s1 = df3['support_x']
s2 = df3['support_y']
s1.name = 'support'
s2.name = 'support'
if not pd.testing.assert_series_equal(s1, s2): print("Assertion passed")
fpgrowth: 1069 transactions, 244 unique items
fastest run: 1.1011493270000017, slowest run: 1.1626929589999975, mean: 1.1198546620000003
apriori: 1069 transactions, 244 unique items
fastest run: 3.4599831980000033, slowest run: 3.8697552900000005, mean: 3.575044468499999
Assertion passed
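# Possible follow-up, continuing the session above (not in the original
# transcript): feed the frequent itemsets into mlxtend's association_rules to
# get candidate recommendation rules. use_colnames=True is assumed here so the
# itemsets hold item names rather than column positions; the confidence
# threshold of 0.2 is illustrative.
from mlxtend.frequent_patterns import association_rules
frequent = fpgrowth(items, min_support=0.01, use_colnames=True)
rules = association_rules(frequent, metric="confidence", min_threshold=0.2)
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].head())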