-
-
Save jmayse/ad688d6a7fd842269996a701d7cecd4c to your computer and use it in GitHub Desktop.
A priori profiling
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Run 1, smaller dataset (1k tx, 244 items), current implementation | |
Line # Mem usage Increment Line Contents | |
================================================ | |
55 78.8 MiB 78.8 MiB @profile | |
56 def apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0): | |
57 """Get frequent itemsets from a one-hot DataFrame | |
58 Parameters | |
59 ----------- | |
60 df : pandas DataFrame or pandas SparseDataFrame | |
61 pandas DataFrame the encoded format. | |
62 The allowed values are either 0/1 or True/False. | |
63 For example, | |
64 | |
65 ``` | |
66 Apple Bananas Beer Chicken Milk Rice | |
67 0 1 0 1 1 0 1 | |
68 1 1 0 1 0 0 1 | |
69 2 1 0 1 0 0 0 | |
70 3 1 1 0 0 0 0 | |
71 4 0 0 1 1 1 1 | |
72 5 0 0 1 0 1 1 | |
73 6 0 0 1 0 1 0 | |
74 7 1 1 0 0 0 0 | |
75 ``` | |
76 | |
77 min_support : float (default: 0.5) | |
78 A float between 0 and 1 for minumum support of the itemsets returned. | |
79 The support is computed as the fraction | |
80 transactions_where_item(s)_occur / total_transactions. | |
81 | |
82 use_colnames : bool (default: False) | |
83 If true, uses the DataFrames' column names in the returned DataFrame | |
84 instead of column indices. | |
85 | |
86 max_len : int (default: None) | |
87 Maximum length of the itemsets generated. If `None` (default) all | |
88 possible itemsets lengths (under the apriori condition) are evaluated. | |
89 | |
90 verbose : int (default: 0) | |
91 Shows the number of iterations if 1. | |
92 | |
93 Returns | |
94 ----------- | |
95 pandas DataFrame with columns ['support', 'itemsets'] of all itemsets | |
96 that are >= `min_support` and < than `max_len` | |
97 (if `max_len` is not None). | |
98 Each itemset in the 'itemsets' column is of type `frozenset`, | |
99 which is a Python built-in type that behaves similarly to | |
100 sets except that it is immutable | |
101 (For more info, see | |
102 https://docs.python.org/3.6/library/stdtypes.html#frozenset). | |
103 | |
104 Examples | |
105 ----------- | |
106 For usage examples, please see | |
107 http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/ | |
108 | |
109 """ | |
110 79.6 MiB 0.8 MiB idxs = np.where((df.values != 1) & (df.values != 0)) | |
111 79.6 MiB 0.0 MiB if len(idxs[0]) > 0: | |
112 val = df.values[idxs[0][0], idxs[1][0]] | |
113 s = ('The allowed values for a DataFrame' | |
114 ' are True, False, 0, 1. Found value %s' % (val)) | |
115 raise ValueError(s) | |
116 | |
117 79.6 MiB 0.0 MiB is_sparse = hasattr(df, "to_coo") | |
118 79.6 MiB 0.0 MiB if is_sparse: | |
119 if not isinstance(df.columns[0], str) and df.columns[0] != 0: | |
120 raise ValueError('Due to current limitations in Pandas, ' | |
121 'if the SparseDataFrame has integer column names,' | |
122 'names, please make sure they either start ' | |
123 'with `0` or cast them as string column names: ' | |
124 '`df.columns = [str(i) for i in df.columns`].') | |
125 | |
126 X = df.to_coo().tocsc() | |
127 support = np.array(np.sum(X, axis=0) / float(X.shape[0])).reshape(-1) | |
128 else: | |
129 79.6 MiB 0.0 MiB X = df.values | |
130 79.6 MiB 0.0 MiB support = (np.sum(X, axis=0) / float(X.shape[0])) | |
131 | |
132 79.6 MiB 0.0 MiB ary_col_idx = np.arange(X.shape[1]) | |
133 79.6 MiB 0.0 MiB support_dict = {1: support[support >= min_support]} | |
134 79.6 MiB 0.0 MiB itemset_dict = {1: ary_col_idx[support >= min_support].reshape(-1, 1)} | |
135 79.6 MiB 0.0 MiB max_itemset = 1 | |
136 79.6 MiB 0.0 MiB rows_count = float(X.shape[0]) | |
137 | |
138 79.6 MiB 0.0 MiB if max_len is None: | |
139 79.6 MiB 0.0 MiB max_len = float('inf') | |
140 | |
141 79.6 MiB 0.0 MiB iter_count = 0 | |
142 | |
143 79.7 MiB 0.0 MiB while max_itemset and max_itemset < max_len: | |
144 79.7 MiB 0.0 MiB next_max_itemset = max_itemset + 1 | |
145 79.7 MiB 0.0 MiB combin = generate_new_combinations(itemset_dict[max_itemset]) | |
146 79.7 MiB 0.0 MiB frequent_items = [] | |
147 79.7 MiB 0.0 MiB frequent_items_support = [] | |
148 | |
149 79.7 MiB 0.0 MiB if is_sparse: | |
150 all_ones = np.ones((X.shape[0], next_max_itemset)) | |
151 79.7 MiB 0.0 MiB for c in combin: | |
152 79.7 MiB 0.0 MiB if verbose: | |
153 iter_count += 1 | |
154 print('\rIteration: %d | Sampling itemset size %d' % | |
155 (iter_count, next_max_itemset), end="") | |
156 79.7 MiB 0.0 MiB if is_sparse: | |
157 together = np.all(X[:, c] == all_ones, axis=1) | |
158 else: | |
159 79.7 MiB 0.0 MiB together = X[:, c].all(axis=1) | |
160 79.7 MiB 0.0 MiB support = together.sum() / rows_count | |
161 79.7 MiB 0.0 MiB if support >= min_support: | |
162 79.7 MiB 0.0 MiB frequent_items.append(c) | |
163 79.7 MiB 0.0 MiB frequent_items_support.append(support) | |
164 | |
165 79.7 MiB 0.0 MiB if frequent_items: | |
166 79.7 MiB 0.0 MiB itemset_dict[next_max_itemset] = np.array(frequent_items) | |
167 79.7 MiB 0.0 MiB support_dict[next_max_itemset] = np.array(frequent_items_support) | |
168 79.7 MiB 0.0 MiB max_itemset = next_max_itemset | |
169 else: | |
170 79.7 MiB 0.0 MiB max_itemset = 0 | |
171 | |
172 79.7 MiB 0.0 MiB all_res = [] | |
173 79.7 MiB 0.0 MiB for k in sorted(itemset_dict): | |
174 79.7 MiB 0.0 MiB support = pd.Series(support_dict[k]) | |
175 79.7 MiB 0.0 MiB itemsets = pd.Series([frozenset(i) for i in itemset_dict[k]]) | |
176 | |
177 79.7 MiB 0.0 MiB res = pd.concat((support, itemsets), axis=1) | |
178 79.7 MiB 0.0 MiB all_res.append(res) | |
179 | |
180 79.8 MiB 0.0 MiB res_df = pd.concat(all_res) | |
181 79.8 MiB 0.0 MiB res_df.columns = ['support', 'itemsets'] | |
182 79.8 MiB 0.0 MiB if use_colnames: | |
183 mapping = {idx: item for idx, item in enumerate(df.columns)} | |
184 res_df['itemsets'] = res_df['itemsets'].apply(lambda x: frozenset([ | |
185 mapping[i] for i in x])) | |
186 79.8 MiB 0.0 MiB res_df = res_df.reset_index(drop=True) | |
187 | |
188 79.8 MiB 0.0 MiB if verbose: | |
189 print() # adds newline if verbose counter was used | |
190 | |
191 79.8 MiB 0.0 MiB return res_df | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Smaller dataset, current implementation, dense dataframe
from timeit import repeat

import pandas as pd

from mlxtend.frequent_patterns.apriori import apriori
from mlxtend.preprocessing import TransactionEncoder

# Load the raw ticket/item rows and one-hot encode each ticket's items.
raw = pd.read_csv('/Projects/drink_recommendation_engine/sample.csv', sep='\t')
encoder = TransactionEncoder()
encoded = encoder.fit_transform(raw.groupby(['ticket_id'])['item_name'].apply(tuple))
# NOTE: `items` must stay at module scope — the timed statement reads it via globals().
items = pd.DataFrame(encoded, columns=encoder.columns_)

# 10 repeats of 10 apriori calls each, on the dense one-hot frame.
repeat("apriori(items, min_support=0.05)", globals=globals(), number=10, repeat=10)
Out[2]: | |
[0.3362377780000001, | |
0.34670980900000004, | |
0.3500013879999999, | |
0.3401237049999999, | |
0.34403185000000036, | |
0.34166972399999995, | |
0.3381037950000003, | |
0.34704037900000007, | |
0.35623237099999994, | |
0.35210514500000034] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Run 2, larger dataset (8.6k tx, 966 items), current implementation | |
/usr/local/bin/python3.7 /Projects/mlxtend/mlxtend/frequent_patterns/apriori.py | |
Filename: /Projects/mlxtend/mlxtend/frequent_patterns/apriori.py | |
Line # Mem usage Increment Line Contents | |
================================================ | |
55 95.7 MiB 95.7 MiB @profile | |
56 def apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0): | |
57 """Get frequent itemsets from a one-hot DataFrame | |
58 Parameters | |
59 ----------- | |
60 df : pandas DataFrame or pandas SparseDataFrame | |
61 pandas DataFrame the encoded format. | |
62 The allowed values are either 0/1 or True/False. | |
63 For example, | |
64 | |
65 ``` | |
66 Apple Bananas Beer Chicken Milk Rice | |
67 0 1 0 1 1 0 1 | |
68 1 1 0 1 0 0 1 | |
69 2 1 0 1 0 0 0 | |
70 3 1 1 0 0 0 0 | |
71 4 0 0 1 1 1 1 | |
72 5 0 0 1 0 1 1 | |
73 6 0 0 1 0 1 0 | |
74 7 1 1 0 0 0 0 | |
75 ``` | |
76 | |
77 min_support : float (default: 0.5) | |
78 A float between 0 and 1 for minumum support of the itemsets returned. | |
79 The support is computed as the fraction | |
80 transactions_where_item(s)_occur / total_transactions. | |
81 | |
82 use_colnames : bool (default: False) | |
83 If true, uses the DataFrames' column names in the returned DataFrame | |
84 instead of column indices. | |
85 | |
86 max_len : int (default: None) | |
87 Maximum length of the itemsets generated. If `None` (default) all | |
88 possible itemsets lengths (under the apriori condition) are evaluated. | |
89 | |
90 verbose : int (default: 0) | |
91 Shows the number of iterations if 1. | |
92 | |
93 Returns | |
94 ----------- | |
95 pandas DataFrame with columns ['support', 'itemsets'] of all itemsets | |
96 that are >= `min_support` and < than `max_len` | |
97 (if `max_len` is not None). | |
98 Each itemset in the 'itemsets' column is of type `frozenset`, | |
99 which is a Python built-in type that behaves similarly to | |
100 sets except that it is immutable | |
101 (For more info, see | |
102 https://docs.python.org/3.6/library/stdtypes.html#frozenset). | |
103 | |
104 Examples | |
105 ----------- | |
106 For usage examples, please see | |
107 http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/ | |
108 | |
109 """ | |
110 112.4 MiB 16.7 MiB idxs = np.where((df.values != 1) & (df.values != 0)) | |
111 112.4 MiB 0.0 MiB if len(idxs[0]) > 0: | |
112 val = df.values[idxs[0][0], idxs[1][0]] | |
113 s = ('The allowed values for a DataFrame' | |
114 ' are True, False, 0, 1. Found value %s' % (val)) | |
115 raise ValueError(s) | |
116 | |
117 112.4 MiB 0.0 MiB is_sparse = hasattr(df, "to_coo") | |
118 112.4 MiB 0.0 MiB if is_sparse: | |
119 if not isinstance(df.columns[0], str) and df.columns[0] != 0: | |
120 raise ValueError('Due to current limitations in Pandas, ' | |
121 'if the SparseDataFrame has integer column names,' | |
122 'names, please make sure they either start ' | |
123 'with `0` or cast them as string column names: ' | |
124 '`df.columns = [str(i) for i in df.columns`].') | |
125 | |
126 X = df.to_coo().tocsc() | |
127 support = np.array(np.sum(X, axis=0) / float(X.shape[0])).reshape(-1) | |
128 else: | |
129 112.4 MiB 0.0 MiB X = df.values | |
130 112.4 MiB 0.0 MiB support = (np.sum(X, axis=0) / float(X.shape[0])) | |
131 | |
132 112.4 MiB 0.0 MiB ary_col_idx = np.arange(X.shape[1]) | |
133 112.4 MiB 0.0 MiB support_dict = {1: support[support >= min_support]} | |
134 112.4 MiB 0.0 MiB itemset_dict = {1: ary_col_idx[support >= min_support].reshape(-1, 1)} | |
135 112.4 MiB 0.0 MiB max_itemset = 1 | |
136 112.4 MiB 0.0 MiB rows_count = float(X.shape[0]) | |
137 | |
138 112.4 MiB 0.0 MiB if max_len is None: | |
139 112.4 MiB 0.0 MiB max_len = float('inf') | |
140 | |
141 112.4 MiB 0.0 MiB iter_count = 0 | |
142 | |
143 112.5 MiB 0.0 MiB while max_itemset and max_itemset < max_len: | |
144 112.5 MiB 0.0 MiB next_max_itemset = max_itemset + 1 | |
145 112.5 MiB 0.0 MiB combin = generate_new_combinations(itemset_dict[max_itemset]) | |
146 112.5 MiB 0.0 MiB frequent_items = [] | |
147 112.5 MiB 0.0 MiB frequent_items_support = [] | |
148 | |
149 112.5 MiB 0.0 MiB if is_sparse: | |
150 all_ones = np.ones((X.shape[0], next_max_itemset)) | |
151 112.5 MiB 0.0 MiB for c in combin: | |
152 112.5 MiB 0.0 MiB if verbose: | |
153 iter_count += 1 | |
154 print('\rIteration: %d | Sampling itemset size %d' % | |
155 (iter_count, next_max_itemset), end="") | |
156 112.5 MiB 0.0 MiB if is_sparse: | |
157 together = np.all(X[:, c] == all_ones, axis=1) | |
158 else: | |
159 112.5 MiB 0.0 MiB together = X[:, c].all(axis=1) | |
160 112.5 MiB 0.0 MiB support = together.sum() / rows_count | |
161 112.5 MiB 0.0 MiB if support >= min_support: | |
162 112.5 MiB 0.0 MiB frequent_items.append(c) | |
163 112.5 MiB 0.0 MiB frequent_items_support.append(support) | |
164 | |
165 112.5 MiB 0.0 MiB if frequent_items: | |
166 112.5 MiB 0.0 MiB itemset_dict[next_max_itemset] = np.array(frequent_items) | |
167 112.5 MiB 0.0 MiB support_dict[next_max_itemset] = np.array(frequent_items_support) | |
168 112.5 MiB 0.0 MiB max_itemset = next_max_itemset | |
169 else: | |
170 112.5 MiB 0.0 MiB max_itemset = 0 | |
171 | |
172 112.5 MiB 0.0 MiB all_res = [] | |
173 112.5 MiB 0.0 MiB for k in sorted(itemset_dict): | |
174 112.5 MiB 0.0 MiB support = pd.Series(support_dict[k]) | |
175 112.5 MiB 0.0 MiB itemsets = pd.Series([frozenset(i) for i in itemset_dict[k]]) | |
176 | |
177 112.5 MiB 0.0 MiB res = pd.concat((support, itemsets), axis=1) | |
178 112.5 MiB 0.0 MiB all_res.append(res) | |
179 | |
180 112.6 MiB 0.0 MiB res_df = pd.concat(all_res) | |
181 112.6 MiB 0.0 MiB res_df.columns = ['support', 'itemsets'] | |
182 112.6 MiB 0.0 MiB if use_colnames: | |
183 mapping = {idx: item for idx, item in enumerate(df.columns)} | |
184 res_df['itemsets'] = res_df['itemsets'].apply(lambda x: frozenset([ | |
185 mapping[i] for i in x])) | |
186 112.6 MiB 0.0 MiB res_df = res_df.reset_index(drop=True) | |
187 | |
188 112.6 MiB 0.0 MiB if verbose: | |
189 print() # adds newline if verbose counter was used | |
190 | |
191 112.6 MiB 0.0 MiB return res_df |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Smaller dataset, current implementation, sparse dataframe
from timeit import repeat

import pandas as pd

from mlxtend.frequent_patterns.apriori import apriori
from mlxtend.preprocessing import TransactionEncoder

# Load the raw ticket/item rows and one-hot encode each ticket's items.
raw = pd.read_csv('/Projects/drink_recommendation_engine/sample.csv', sep='\t')
encoder = TransactionEncoder()
encoded = encoder.fit_transform(raw.groupby(['ticket_id'])['item_name'].apply(tuple))
# Sparse variant of the same frame; fill value 0 so absent items are implicit.
# NOTE: `items` must stay at module scope — the timed statement reads it via globals().
items = pd.SparseDataFrame(encoded, columns=encoder.columns_, default_fill_value=0)

# 10 repeats of 10 apriori calls each, on the sparse one-hot frame.
repeat("apriori(items, min_support=0.05)", globals=globals(), number=10, repeat=10)
Out[2]: | |
[3.3068226319999994, | |
3.2766413269999983, | |
3.3034355029999993, | |
3.4978498950000017, | |
3.502342746, | |
3.5774782919999986, | |
3.6397878360000036, | |
3.361885745000002, | |
3.2973721370000035, | |
3.293594581999997] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Run 3, small dataset, new implementation | |
Line # Mem usage Increment Line Contents | |
================================================ | |
55 78.8 MiB 78.8 MiB @profile | |
56 def apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0): | |
57 """Get frequent itemsets from a one-hot DataFrame | |
58 Parameters | |
59 ----------- | |
60 df : pandas DataFrame or pandas SparseDataFrame | |
61 pandas DataFrame the encoded format. | |
62 The allowed values are either 0/1 or True/False. | |
63 For example, | |
64 | |
65 ``` | |
66 Apple Bananas Beer Chicken Milk Rice | |
67 0 1 0 1 1 0 1 | |
68 1 1 0 1 0 0 1 | |
69 2 1 0 1 0 0 0 | |
70 3 1 1 0 0 0 0 | |
71 4 0 0 1 1 1 1 | |
72 5 0 0 1 0 1 1 | |
73 6 0 0 1 0 1 0 | |
74 7 1 1 0 0 0 0 | |
75 ``` | |
76 | |
77 min_support : float (default: 0.5) | |
78 A float between 0 and 1 for minumum support of the itemsets returned. | |
79 The support is computed as the fraction | |
80 transactions_where_item(s)_occur / total_transactions. | |
81 | |
82 use_colnames : bool (default: False) | |
83 If true, uses the DataFrames' column names in the returned DataFrame | |
84 instead of column indices. | |
85 | |
86 max_len : int (default: None) | |
87 Maximum length of the itemsets generated. If `None` (default) all | |
88 possible itemsets lengths (under the apriori condition) are evaluated. | |
89 | |
90 verbose : int (default: 0) | |
91 Shows the number of iterations if 1. | |
92 | |
93 Returns | |
94 ----------- | |
95 pandas DataFrame with columns ['support', 'itemsets'] of all itemsets | |
96 that are >= `min_support` and < than `max_len` | |
97 (if `max_len` is not None). | |
98 Each itemset in the 'itemsets' column is of type `frozenset`, | |
99 which is a Python built-in type that behaves similarly to | |
100 sets except that it is immutable | |
101 (For more info, see | |
102 https://docs.python.org/3.6/library/stdtypes.html#frozenset). | |
103 | |
104 Examples | |
105 ----------- | |
106 For usage examples, please see | |
107 http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/ | |
108 | |
109 """ | |
110 | |
111 84.7 MiB 0.9 MiB def _support(_x, _n_rows, _is_sparse): | |
112 """ | |
113 DRY local method to calculate support as the row-wise sum of values / number of rows | |
114 :param _x: matrix of bools or binary | |
115 :param _n_rows: numeric, number of rows | |
116 :param _is_sparse: bool True if _x is sparse | |
117 :return: np.array, shape = (n_rows, ) | |
118 """ | |
119 84.7 MiB 0.0 MiB out = (np.sum(_x, axis=0) / _n_rows) | |
120 84.7 MiB 0.0 MiB return np.array(out).reshape(-1) | |
121 | |
122 79.6 MiB 0.8 MiB idxs = np.where((df.values != 1) & (df.values != 0)) | |
123 79.6 MiB 0.0 MiB if len(idxs[0]) > 0: | |
124 val = df.values[idxs[0][0], idxs[1][0]] | |
125 s = ('The allowed values for a DataFrame' | |
126 ' are True, False, 0, 1. Found value %s' % (val)) | |
127 raise ValueError(s) | |
128 | |
129 79.6 MiB 0.0 MiB is_sparse = hasattr(df, "to_coo") | |
130 79.6 MiB 0.0 MiB if is_sparse: | |
131 if not isinstance(df.columns[0], str) and df.columns[0] != 0: | |
132 raise ValueError('Due to current limitations in Pandas, ' | |
133 'if the SparseDataFrame has integer column names,' | |
134 'names, please make sure they either start ' | |
135 'with `0` or cast them as string column names: ' | |
136 '`df.columns = [str(i) for i in df.columns`].') | |
137 | |
138 X = df.to_coo().tocsc() | |
139 support = _support(X, X.shape[0], is_sparse) | |
140 else: | |
141 79.6 MiB 0.0 MiB X = df.values | |
142 79.6 MiB 0.0 MiB support = _support(X, X.shape[0], is_sparse) | |
143 | |
144 79.6 MiB 0.0 MiB ary_col_idx = np.arange(X.shape[1]) | |
145 79.6 MiB 0.0 MiB support_dict = {1: support[support >= min_support]} | |
146 79.6 MiB 0.0 MiB itemset_dict = {1: ary_col_idx[support >= min_support].reshape(-1, 1)} | |
147 79.6 MiB 0.0 MiB max_itemset = 1 | |
148 79.6 MiB 0.0 MiB rows_count = float(X.shape[0]) | |
149 | |
150 84.2 MiB 0.0 MiB while max_itemset and max_itemset < (max_len or float('inf')): | |
151 84.2 MiB 0.0 MiB next_max_itemset = max_itemset + 1 | |
152 84.2 MiB 0.0 MiB combin = np.array(list(generate_new_combinations(itemset_dict[max_itemset]))) | |
153 | |
154 84.2 MiB 0.0 MiB if combin.size == 0: | |
155 break | |
156 | |
157 84.2 MiB 0.0 MiB if is_sparse: | |
158 all_ones = np.ones((int(rows_count), 1)) | |
159 _bools = X[:, combin[:, 0]] == all_ones | |
160 for n in range(1, combin.shape[1]): | |
161 _bools = _bools & (X[:, combin[:, n]] == all_ones) | |
162 else: | |
163 84.2 MiB 2.2 MiB _bools = np.all(X[:, combin], axis=2) | |
164 | |
165 84.2 MiB 0.0 MiB support = _support(np.array(_bools), rows_count, is_sparse) | |
166 84.2 MiB 0.0 MiB _mask = (support >= min_support).reshape(-1) | |
167 | |
168 84.2 MiB 0.0 MiB if any(_mask): | |
169 84.2 MiB 0.0 MiB itemset_dict[next_max_itemset] = np.array(combin[_mask]) | |
170 84.2 MiB 0.0 MiB support_dict[next_max_itemset] = np.array(support[_mask]) | |
171 84.2 MiB 0.0 MiB max_itemset = next_max_itemset | |
172 else: | |
173 83.8 MiB 0.0 MiB break | |
174 83.8 MiB 0.0 MiB all_res = [] | |
175 83.8 MiB 0.0 MiB for k in sorted(itemset_dict): | |
176 83.8 MiB 0.0 MiB support = pd.Series(support_dict[k]) | |
177 83.8 MiB 0.0 MiB itemsets = pd.Series([frozenset(i) for i in itemset_dict[k]]) | |
178 | |
179 83.8 MiB 0.0 MiB res = pd.concat((support, itemsets), axis=1) | |
180 83.8 MiB 0.0 MiB all_res.append(res) | |
181 | |
182 83.9 MiB 0.0 MiB res_df = pd.concat(all_res) | |
183 83.9 MiB 0.0 MiB res_df.columns = ['support', 'itemsets'] | |
184 83.9 MiB 0.0 MiB if use_colnames: | |
185 mapping = {idx: item for idx, item in enumerate(df.columns)} | |
186 res_df['itemsets'] = res_df['itemsets'].apply(lambda x: frozenset([ | |
187 mapping[i] for i in x])) | |
188 83.9 MiB 0.0 MiB res_df = res_df.reset_index(drop=True) | |
189 | |
190 83.9 MiB 0.0 MiB if verbose: | |
191 print() # adds newline if verbose counter was used | |
192 | |
193 83.9 MiB 0.0 MiB return res_df |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Smaller dataset, new implementation, dense dataframe
from timeit import repeat

import pandas as pd

from mlxtend.frequent_patterns.apriori import apriori
from mlxtend.preprocessing import TransactionEncoder

# Load the raw ticket/item rows and one-hot encode each ticket's items.
raw = pd.read_csv('/Projects/drink_recommendation_engine/sample.csv', sep='\t')
encoder = TransactionEncoder()
encoded = encoder.fit_transform(raw.groupby(['ticket_id'])['item_name'].apply(tuple))
# NOTE: `items` must stay at module scope — the timed statement reads it via globals().
items = pd.DataFrame(encoded, columns=encoder.columns_)

# 10 repeats of 10 apriori calls each, on the dense one-hot frame.
repeat("apriori(items, min_support=0.05)", globals=globals(), number=10, repeat=10)
Out[2]: | |
[0.12199777899999997, | |
0.11085488300000002, | |
0.12048885899999995, | |
0.12400905900000003, | |
0.1155322019999998, | |
0.11182724500000019, | |
0.12058627099999963, | |
0.11633145700000025, | |
0.11280813100000042, | |
0.11844944400000035] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Smaller dataset, new implementation, sparse dataframe
from timeit import repeat

import pandas as pd

from mlxtend.frequent_patterns.apriori import apriori
from mlxtend.preprocessing import TransactionEncoder

# Load the raw ticket/item rows and one-hot encode each ticket's items.
raw = pd.read_csv('/Projects/drink_recommendation_engine/sample.csv', sep='\t')
encoder = TransactionEncoder()
encoded = encoder.fit_transform(raw.groupby(['ticket_id'])['item_name'].apply(tuple))
# Sparse variant of the same frame; fill value 0 so absent items are implicit.
# NOTE: `items` must stay at module scope — the timed statement reads it via globals().
items = pd.SparseDataFrame(encoded, columns=encoder.columns_, default_fill_value=0)

# 10 repeats of 10 apriori calls each, on the sparse one-hot frame.
repeat("apriori(items, min_support=0.05)", globals=globals(), number=10, repeat=10)
Out[2]: | |
[0.5350212700000001, | |
0.5399577099999999, | |
0.5390251770000001, | |
0.5814595669999996, | |
0.5923116000000004, | |
0.5686850980000004, | |
0.5839204499999999, | |
0.4983124719999994, | |
0.5384785540000001, | |
0.5440615109999998] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Run 4, larger dataset, new implementation | |
Line # Mem usage Increment Line Contents | |
================================================ | |
55 95.7 MiB 95.7 MiB @profile | |
56 def apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0): | |
57 """Get frequent itemsets from a one-hot DataFrame | |
58 Parameters | |
59 ----------- | |
60 df : pandas DataFrame or pandas SparseDataFrame | |
61 pandas DataFrame the encoded format. | |
62 The allowed values are either 0/1 or True/False. | |
63 For example, | |
64 | |
65 ``` | |
66 Apple Bananas Beer Chicken Milk Rice | |
67 0 1 0 1 1 0 1 | |
68 1 1 0 1 0 0 1 | |
69 2 1 0 1 0 0 0 | |
70 3 1 1 0 0 0 0 | |
71 4 0 0 1 1 1 1 | |
72 5 0 0 1 0 1 1 | |
73 6 0 0 1 0 1 0 | |
74 7 1 1 0 0 0 0 | |
75 ``` | |
76 | |
77 min_support : float (default: 0.5) | |
78 A float between 0 and 1 for minumum support of the itemsets returned. | |
79 The support is computed as the fraction | |
80 transactions_where_item(s)_occur / total_transactions. | |
81 | |
82 use_colnames : bool (default: False) | |
83 If true, uses the DataFrames' column names in the returned DataFrame | |
84 instead of column indices. | |
85 | |
86 max_len : int (default: None) | |
87 Maximum length of the itemsets generated. If `None` (default) all | |
88 possible itemsets lengths (under the apriori condition) are evaluated. | |
89 | |
90 verbose : int (default: 0) | |
91 Shows the number of iterations if 1. | |
92 | |
93 Returns | |
94 ----------- | |
95 pandas DataFrame with columns ['support', 'itemsets'] of all itemsets | |
96 that are >= `min_support` and < than `max_len` | |
97 (if `max_len` is not None). | |
98 Each itemset in the 'itemsets' column is of type `frozenset`, | |
99 which is a Python built-in type that behaves similarly to | |
100 sets except that it is immutable | |
101 (For more info, see | |
102 https://docs.python.org/3.6/library/stdtypes.html#frozenset). | |
103 | |
104 Examples | |
105 ----------- | |
106 For usage examples, please see | |
107 http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/ | |
108 | |
109 """ | |
110 | |
111 118.2 MiB 2.9 MiB def _support(_x, _n_rows, _is_sparse): | |
112 """ | |
113 DRY local method to calculate support as the row-wise sum of values / number of rows | |
114 :param _x: matrix of bools or binary | |
115 :param _n_rows: numeric, number of rows | |
116 :param _is_sparse: bool True if _x is sparse | |
117 :return: np.array, shape = (n_rows, ) | |
118 """ | |
119 118.2 MiB 0.0 MiB out = (np.sum(_x, axis=0) / _n_rows) | |
120 118.2 MiB 0.0 MiB return np.array(out).reshape(-1) | |
121 | |
122 112.4 MiB 16.7 MiB idxs = np.where((df.values != 1) & (df.values != 0)) | |
123 112.4 MiB 0.0 MiB if len(idxs[0]) > 0: | |
124 val = df.values[idxs[0][0], idxs[1][0]] | |
125 s = ('The allowed values for a DataFrame' | |
126 ' are True, False, 0, 1. Found value %s' % (val)) | |
127 raise ValueError(s) | |
128 | |
129 112.4 MiB 0.0 MiB is_sparse = hasattr(df, "to_coo") | |
130 112.4 MiB 0.0 MiB if is_sparse: | |
131 if not isinstance(df.columns[0], str) and df.columns[0] != 0: | |
132 raise ValueError('Due to current limitations in Pandas, ' | |
133 'if the SparseDataFrame has integer column names,' | |
134 'names, please make sure they either start ' | |
135 'with `0` or cast them as string column names: ' | |
136 '`df.columns = [str(i) for i in df.columns`].') | |
137 | |
138 X = df.to_coo().tocsc() | |
139 support = _support(X, X.shape[0], is_sparse) | |
140 else: | |
141 112.4 MiB 0.0 MiB X = df.values | |
142 112.4 MiB 0.0 MiB support = _support(X, X.shape[0], is_sparse) | |
143 | |
144 112.4 MiB 0.0 MiB ary_col_idx = np.arange(X.shape[1]) | |
145 112.4 MiB 0.0 MiB support_dict = {1: support[support >= min_support]} | |
146 112.4 MiB 0.0 MiB itemset_dict = {1: ary_col_idx[support >= min_support].reshape(-1, 1)} | |
147 112.4 MiB 0.0 MiB max_itemset = 1 | |
148 112.4 MiB 0.0 MiB rows_count = float(X.shape[0]) | |
149 | |
150 117.7 MiB 0.0 MiB while max_itemset and max_itemset < (max_len or float('inf')): | |
151 117.7 MiB 0.0 MiB next_max_itemset = max_itemset + 1 | |
152 117.7 MiB 0.0 MiB combin = np.array(list(generate_new_combinations(itemset_dict[max_itemset]))) | |
153 | |
154 117.7 MiB 0.0 MiB if combin.size == 0: | |
155 break | |
156 | |
157 117.7 MiB 0.0 MiB if is_sparse: | |
158 all_ones = np.ones((int(rows_count), 1)) | |
159 _bools = X[:, combin[:, 0]] == all_ones | |
160 for n in range(1, combin.shape[1]): | |
161 _bools = _bools & (X[:, combin[:, n]] == all_ones) | |
162 else: | |
163 117.8 MiB 2.9 MiB _bools = np.all(X[:, combin], axis=2) | |
164 | |
165 117.8 MiB 0.0 MiB support = _support(np.array(_bools), rows_count, is_sparse) | |
166 117.8 MiB 0.0 MiB _mask = (support >= min_support).reshape(-1) | |
167 | |
168 117.8 MiB 0.0 MiB if any(_mask): | |
169 117.7 MiB 0.0 MiB itemset_dict[next_max_itemset] = np.array(combin[_mask]) | |
170 117.7 MiB 0.0 MiB support_dict[next_max_itemset] = np.array(support[_mask]) | |
171 117.7 MiB 0.0 MiB max_itemset = next_max_itemset | |
172 else: | |
173 117.8 MiB 0.0 MiB break | |
174 117.8 MiB 0.0 MiB all_res = [] | |
175 117.9 MiB 0.0 MiB for k in sorted(itemset_dict): | |
176 117.9 MiB 0.0 MiB support = pd.Series(support_dict[k]) | |
177 117.9 MiB 0.0 MiB itemsets = pd.Series([frozenset(i) for i in itemset_dict[k]]) | |
178 | |
179 117.9 MiB 0.0 MiB res = pd.concat((support, itemsets), axis=1) | |
180 117.9 MiB 0.0 MiB all_res.append(res) | |
181 | |
182 117.9 MiB 0.0 MiB res_df = pd.concat(all_res) | |
183 117.9 MiB 0.0 MiB res_df.columns = ['support', 'itemsets'] | |
184 117.9 MiB 0.0 MiB if use_colnames: | |
185 mapping = {idx: item for idx, item in enumerate(df.columns)} | |
186 res_df['itemsets'] = res_df['itemsets'].apply(lambda x: frozenset([ | |
187 mapping[i] for i in x])) | |
188 117.9 MiB 0.0 MiB res_df = res_df.reset_index(drop=True) | |
189 | |
190 117.9 MiB 0.0 MiB if verbose: | |
191 print() # adds newline if verbose counter was used | |
192 | |
193 117.9 MiB 0.0 MiB return res_df |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment