Skip to content

Instantly share code, notes, and snippets.

@jmayse
Last active July 18, 2019 14:02
Show Gist options
  • Save jmayse/ad688d6a7fd842269996a701d7cecd4c to your computer and use it in GitHub Desktop.
A priori profiling
## Run 1, smaller dataset (1k tx, 244 items), current implementation
Line # Mem usage Increment Line Contents
================================================
55 78.8 MiB 78.8 MiB @profile
56 def apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0):
57 """Get frequent itemsets from a one-hot DataFrame
58 Parameters
59 -----------
60 df : pandas DataFrame or pandas SparseDataFrame
61 pandas DataFrame the encoded format.
62 The allowed values are either 0/1 or True/False.
63 For example,
64
65 ```
66 Apple Bananas Beer Chicken Milk Rice
67 0 1 0 1 1 0 1
68 1 1 0 1 0 0 1
69 2 1 0 1 0 0 0
70 3 1 1 0 0 0 0
71 4 0 0 1 1 1 1
72 5 0 0 1 0 1 1
73 6 0 0 1 0 1 0
74 7 1 1 0 0 0 0
75 ```
76
77 min_support : float (default: 0.5)
78 A float between 0 and 1 for minumum support of the itemsets returned.
79 The support is computed as the fraction
80 transactions_where_item(s)_occur / total_transactions.
81
82 use_colnames : bool (default: False)
83 If true, uses the DataFrames' column names in the returned DataFrame
84 instead of column indices.
85
86 max_len : int (default: None)
87 Maximum length of the itemsets generated. If `None` (default) all
88 possible itemsets lengths (under the apriori condition) are evaluated.
89
90 verbose : int (default: 0)
91 Shows the number of iterations if 1.
92
93 Returns
94 -----------
95 pandas DataFrame with columns ['support', 'itemsets'] of all itemsets
96 that are >= `min_support` and < than `max_len`
97 (if `max_len` is not None).
98 Each itemset in the 'itemsets' column is of type `frozenset`,
99 which is a Python built-in type that behaves similarly to
100 sets except that it is immutable
101 (For more info, see
102 https://docs.python.org/3.6/library/stdtypes.html#frozenset).
103
104 Examples
105 -----------
106 For usage examples, please see
107 http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/
108
109 """
110 79.6 MiB 0.8 MiB idxs = np.where((df.values != 1) & (df.values != 0))
111 79.6 MiB 0.0 MiB if len(idxs[0]) > 0:
112 val = df.values[idxs[0][0], idxs[1][0]]
113 s = ('The allowed values for a DataFrame'
114 ' are True, False, 0, 1. Found value %s' % (val))
115 raise ValueError(s)
116
117 79.6 MiB 0.0 MiB is_sparse = hasattr(df, "to_coo")
118 79.6 MiB 0.0 MiB if is_sparse:
119 if not isinstance(df.columns[0], str) and df.columns[0] != 0:
120 raise ValueError('Due to current limitations in Pandas, '
121 'if the SparseDataFrame has integer column names,'
122 'names, please make sure they either start '
123 'with `0` or cast them as string column names: '
124 '`df.columns = [str(i) for i in df.columns`].')
125
126 X = df.to_coo().tocsc()
127 support = np.array(np.sum(X, axis=0) / float(X.shape[0])).reshape(-1)
128 else:
129 79.6 MiB 0.0 MiB X = df.values
130 79.6 MiB 0.0 MiB support = (np.sum(X, axis=0) / float(X.shape[0]))
131
132 79.6 MiB 0.0 MiB ary_col_idx = np.arange(X.shape[1])
133 79.6 MiB 0.0 MiB support_dict = {1: support[support >= min_support]}
134 79.6 MiB 0.0 MiB itemset_dict = {1: ary_col_idx[support >= min_support].reshape(-1, 1)}
135 79.6 MiB 0.0 MiB max_itemset = 1
136 79.6 MiB 0.0 MiB rows_count = float(X.shape[0])
137
138 79.6 MiB 0.0 MiB if max_len is None:
139 79.6 MiB 0.0 MiB max_len = float('inf')
140
141 79.6 MiB 0.0 MiB iter_count = 0
142
143 79.7 MiB 0.0 MiB while max_itemset and max_itemset < max_len:
144 79.7 MiB 0.0 MiB next_max_itemset = max_itemset + 1
145 79.7 MiB 0.0 MiB combin = generate_new_combinations(itemset_dict[max_itemset])
146 79.7 MiB 0.0 MiB frequent_items = []
147 79.7 MiB 0.0 MiB frequent_items_support = []
148
149 79.7 MiB 0.0 MiB if is_sparse:
150 all_ones = np.ones((X.shape[0], next_max_itemset))
151 79.7 MiB 0.0 MiB for c in combin:
152 79.7 MiB 0.0 MiB if verbose:
153 iter_count += 1
154 print('\rIteration: %d | Sampling itemset size %d' %
155 (iter_count, next_max_itemset), end="")
156 79.7 MiB 0.0 MiB if is_sparse:
157 together = np.all(X[:, c] == all_ones, axis=1)
158 else:
159 79.7 MiB 0.0 MiB together = X[:, c].all(axis=1)
160 79.7 MiB 0.0 MiB support = together.sum() / rows_count
161 79.7 MiB 0.0 MiB if support >= min_support:
162 79.7 MiB 0.0 MiB frequent_items.append(c)
163 79.7 MiB 0.0 MiB frequent_items_support.append(support)
164
165 79.7 MiB 0.0 MiB if frequent_items:
166 79.7 MiB 0.0 MiB itemset_dict[next_max_itemset] = np.array(frequent_items)
167 79.7 MiB 0.0 MiB support_dict[next_max_itemset] = np.array(frequent_items_support)
168 79.7 MiB 0.0 MiB max_itemset = next_max_itemset
169 else:
170 79.7 MiB 0.0 MiB max_itemset = 0
171
172 79.7 MiB 0.0 MiB all_res = []
173 79.7 MiB 0.0 MiB for k in sorted(itemset_dict):
174 79.7 MiB 0.0 MiB support = pd.Series(support_dict[k])
175 79.7 MiB 0.0 MiB itemsets = pd.Series([frozenset(i) for i in itemset_dict[k]])
176
177 79.7 MiB 0.0 MiB res = pd.concat((support, itemsets), axis=1)
178 79.7 MiB 0.0 MiB all_res.append(res)
179
180 79.8 MiB 0.0 MiB res_df = pd.concat(all_res)
181 79.8 MiB 0.0 MiB res_df.columns = ['support', 'itemsets']
182 79.8 MiB 0.0 MiB if use_colnames:
183 mapping = {idx: item for idx, item in enumerate(df.columns)}
184 res_df['itemsets'] = res_df['itemsets'].apply(lambda x: frozenset([
185 mapping[i] for i in x]))
186 79.8 MiB 0.0 MiB res_df = res_df.reset_index(drop=True)
187
188 79.8 MiB 0.0 MiB if verbose:
189 print() # adds newline if verbose counter was used
190
191 79.8 MiB 0.0 MiB return res_df
# Smaller dataset, current implementation, dense dataframe.
# Times mlxtend's apriori on a dense one-hot DataFrame built from a
# transaction log (tab-separated; presumably one row per ticket/item — TODO confirm schema).
from timeit import repeat

import pandas as pd
from mlxtend.frequent_patterns.apriori import apriori
from mlxtend.preprocessing import TransactionEncoder

df = pd.read_csv('/Projects/drink_recommendation_engine/sample.csv', sep='\t')

# Collapse each ticket's items into a tuple, then one-hot encode.
te = TransactionEncoder()
_items = te.fit_transform(df.groupby(['ticket_id'])['item_name'].apply(tuple))
items = pd.DataFrame(_items, columns=te.columns_)

# 10 repeats of 10 runs each at 5% minimum support; returns total seconds per repeat.
repeat("apriori(items, min_support=0.05)", globals=globals(), number=10, repeat=10)
Out[2]:
[0.3362377780000001,
0.34670980900000004,
0.3500013879999999,
0.3401237049999999,
0.34403185000000036,
0.34166972399999995,
0.3381037950000003,
0.34704037900000007,
0.35623237099999994,
0.35210514500000034]
# Run 2, larger dataset (8.6k tx, 966 items), current implementation
/usr/local/bin/python3.7 /Projects/mlxtend/mlxtend/frequent_patterns/apriori.py
Filename: /Projects/mlxtend/mlxtend/frequent_patterns/apriori.py
Line # Mem usage Increment Line Contents
================================================
55 95.7 MiB 95.7 MiB @profile
56 def apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0):
57 """Get frequent itemsets from a one-hot DataFrame
58 Parameters
59 -----------
60 df : pandas DataFrame or pandas SparseDataFrame
61 pandas DataFrame the encoded format.
62 The allowed values are either 0/1 or True/False.
63 For example,
64
65 ```
66 Apple Bananas Beer Chicken Milk Rice
67 0 1 0 1 1 0 1
68 1 1 0 1 0 0 1
69 2 1 0 1 0 0 0
70 3 1 1 0 0 0 0
71 4 0 0 1 1 1 1
72 5 0 0 1 0 1 1
73 6 0 0 1 0 1 0
74 7 1 1 0 0 0 0
75 ```
76
77 min_support : float (default: 0.5)
78 A float between 0 and 1 for minumum support of the itemsets returned.
79 The support is computed as the fraction
80 transactions_where_item(s)_occur / total_transactions.
81
82 use_colnames : bool (default: False)
83 If true, uses the DataFrames' column names in the returned DataFrame
84 instead of column indices.
85
86 max_len : int (default: None)
87 Maximum length of the itemsets generated. If `None` (default) all
88 possible itemsets lengths (under the apriori condition) are evaluated.
89
90 verbose : int (default: 0)
91 Shows the number of iterations if 1.
92
93 Returns
94 -----------
95 pandas DataFrame with columns ['support', 'itemsets'] of all itemsets
96 that are >= `min_support` and < than `max_len`
97 (if `max_len` is not None).
98 Each itemset in the 'itemsets' column is of type `frozenset`,
99 which is a Python built-in type that behaves similarly to
100 sets except that it is immutable
101 (For more info, see
102 https://docs.python.org/3.6/library/stdtypes.html#frozenset).
103
104 Examples
105 -----------
106 For usage examples, please see
107 http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/
108
109 """
110 112.4 MiB 16.7 MiB idxs = np.where((df.values != 1) & (df.values != 0))
111 112.4 MiB 0.0 MiB if len(idxs[0]) > 0:
112 val = df.values[idxs[0][0], idxs[1][0]]
113 s = ('The allowed values for a DataFrame'
114 ' are True, False, 0, 1. Found value %s' % (val))
115 raise ValueError(s)
116
117 112.4 MiB 0.0 MiB is_sparse = hasattr(df, "to_coo")
118 112.4 MiB 0.0 MiB if is_sparse:
119 if not isinstance(df.columns[0], str) and df.columns[0] != 0:
120 raise ValueError('Due to current limitations in Pandas, '
121 'if the SparseDataFrame has integer column names,'
122 'names, please make sure they either start '
123 'with `0` or cast them as string column names: '
124 '`df.columns = [str(i) for i in df.columns`].')
125
126 X = df.to_coo().tocsc()
127 support = np.array(np.sum(X, axis=0) / float(X.shape[0])).reshape(-1)
128 else:
129 112.4 MiB 0.0 MiB X = df.values
130 112.4 MiB 0.0 MiB support = (np.sum(X, axis=0) / float(X.shape[0]))
131
132 112.4 MiB 0.0 MiB ary_col_idx = np.arange(X.shape[1])
133 112.4 MiB 0.0 MiB support_dict = {1: support[support >= min_support]}
134 112.4 MiB 0.0 MiB itemset_dict = {1: ary_col_idx[support >= min_support].reshape(-1, 1)}
135 112.4 MiB 0.0 MiB max_itemset = 1
136 112.4 MiB 0.0 MiB rows_count = float(X.shape[0])
137
138 112.4 MiB 0.0 MiB if max_len is None:
139 112.4 MiB 0.0 MiB max_len = float('inf')
140
141 112.4 MiB 0.0 MiB iter_count = 0
142
143 112.5 MiB 0.0 MiB while max_itemset and max_itemset < max_len:
144 112.5 MiB 0.0 MiB next_max_itemset = max_itemset + 1
145 112.5 MiB 0.0 MiB combin = generate_new_combinations(itemset_dict[max_itemset])
146 112.5 MiB 0.0 MiB frequent_items = []
147 112.5 MiB 0.0 MiB frequent_items_support = []
148
149 112.5 MiB 0.0 MiB if is_sparse:
150 all_ones = np.ones((X.shape[0], next_max_itemset))
151 112.5 MiB 0.0 MiB for c in combin:
152 112.5 MiB 0.0 MiB if verbose:
153 iter_count += 1
154 print('\rIteration: %d | Sampling itemset size %d' %
155 (iter_count, next_max_itemset), end="")
156 112.5 MiB 0.0 MiB if is_sparse:
157 together = np.all(X[:, c] == all_ones, axis=1)
158 else:
159 112.5 MiB 0.0 MiB together = X[:, c].all(axis=1)
160 112.5 MiB 0.0 MiB support = together.sum() / rows_count
161 112.5 MiB 0.0 MiB if support >= min_support:
162 112.5 MiB 0.0 MiB frequent_items.append(c)
163 112.5 MiB 0.0 MiB frequent_items_support.append(support)
164
165 112.5 MiB 0.0 MiB if frequent_items:
166 112.5 MiB 0.0 MiB itemset_dict[next_max_itemset] = np.array(frequent_items)
167 112.5 MiB 0.0 MiB support_dict[next_max_itemset] = np.array(frequent_items_support)
168 112.5 MiB 0.0 MiB max_itemset = next_max_itemset
169 else:
170 112.5 MiB 0.0 MiB max_itemset = 0
171
172 112.5 MiB 0.0 MiB all_res = []
173 112.5 MiB 0.0 MiB for k in sorted(itemset_dict):
174 112.5 MiB 0.0 MiB support = pd.Series(support_dict[k])
175 112.5 MiB 0.0 MiB itemsets = pd.Series([frozenset(i) for i in itemset_dict[k]])
176
177 112.5 MiB 0.0 MiB res = pd.concat((support, itemsets), axis=1)
178 112.5 MiB 0.0 MiB all_res.append(res)
179
180 112.6 MiB 0.0 MiB res_df = pd.concat(all_res)
181 112.6 MiB 0.0 MiB res_df.columns = ['support', 'itemsets']
182 112.6 MiB 0.0 MiB if use_colnames:
183 mapping = {idx: item for idx, item in enumerate(df.columns)}
184 res_df['itemsets'] = res_df['itemsets'].apply(lambda x: frozenset([
185 mapping[i] for i in x]))
186 112.6 MiB 0.0 MiB res_df = res_df.reset_index(drop=True)
187
188 112.6 MiB 0.0 MiB if verbose:
189 print() # adds newline if verbose counter was used
190
191 112.6 MiB 0.0 MiB return res_df
# Smaller dataset, current implementation, sparse dataframe.
# Same benchmark as the dense run, but feeds apriori a SparseDataFrame
# (NOTE(review): pd.SparseDataFrame is deprecated/removed in pandas >= 1.0 —
# this transcript predates that).
from timeit import repeat

import pandas as pd
from mlxtend.frequent_patterns.apriori import apriori
from mlxtend.preprocessing import TransactionEncoder

df = pd.read_csv('/Projects/drink_recommendation_engine/sample.csv', sep='\t')

# Collapse each ticket's items into a tuple, then one-hot encode.
te = TransactionEncoder()
_items = te.fit_transform(df.groupby(['ticket_id'])['item_name'].apply(tuple))
items = pd.SparseDataFrame(_items, columns=te.columns_, default_fill_value=0)

# 10 repeats of 10 runs each at 5% minimum support; returns total seconds per repeat.
repeat("apriori(items, min_support=0.05)", globals=globals(), number=10, repeat=10)
Out[2]:
[3.3068226319999994,
3.2766413269999983,
3.3034355029999993,
3.4978498950000017,
3.502342746,
3.5774782919999986,
3.6397878360000036,
3.361885745000002,
3.2973721370000035,
3.293594581999997]
# Run 3, small dataset, new implementation
Line # Mem usage Increment Line Contents
================================================
55 78.8 MiB 78.8 MiB @profile
56 def apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0):
57 """Get frequent itemsets from a one-hot DataFrame
58 Parameters
59 -----------
60 df : pandas DataFrame or pandas SparseDataFrame
61 pandas DataFrame the encoded format.
62 The allowed values are either 0/1 or True/False.
63 For example,
64
65 ```
66 Apple Bananas Beer Chicken Milk Rice
67 0 1 0 1 1 0 1
68 1 1 0 1 0 0 1
69 2 1 0 1 0 0 0
70 3 1 1 0 0 0 0
71 4 0 0 1 1 1 1
72 5 0 0 1 0 1 1
73 6 0 0 1 0 1 0
74 7 1 1 0 0 0 0
75 ```
76
77 min_support : float (default: 0.5)
78 A float between 0 and 1 for minumum support of the itemsets returned.
79 The support is computed as the fraction
80 transactions_where_item(s)_occur / total_transactions.
81
82 use_colnames : bool (default: False)
83 If true, uses the DataFrames' column names in the returned DataFrame
84 instead of column indices.
85
86 max_len : int (default: None)
87 Maximum length of the itemsets generated. If `None` (default) all
88 possible itemsets lengths (under the apriori condition) are evaluated.
89
90 verbose : int (default: 0)
91 Shows the number of iterations if 1.
92
93 Returns
94 -----------
95 pandas DataFrame with columns ['support', 'itemsets'] of all itemsets
96 that are >= `min_support` and < than `max_len`
97 (if `max_len` is not None).
98 Each itemset in the 'itemsets' column is of type `frozenset`,
99 which is a Python built-in type that behaves similarly to
100 sets except that it is immutable
101 (For more info, see
102 https://docs.python.org/3.6/library/stdtypes.html#frozenset).
103
104 Examples
105 -----------
106 For usage examples, please see
107 http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/
108
109 """
110
111 84.7 MiB 0.9 MiB def _support(_x, _n_rows, _is_sparse):
112 """
113 DRY local method to calculate support as the row-wise sum of values / number of rows
114 :param _x: matrix of bools or binary
115 :param _n_rows: numeric, number of rows
116 :param _is_sparse: bool True if _x is sparse
117 :return: np.array, shape = (n_rows, )
118 """
119 84.7 MiB 0.0 MiB out = (np.sum(_x, axis=0) / _n_rows)
120 84.7 MiB 0.0 MiB return np.array(out).reshape(-1)
121
122 79.6 MiB 0.8 MiB idxs = np.where((df.values != 1) & (df.values != 0))
123 79.6 MiB 0.0 MiB if len(idxs[0]) > 0:
124 val = df.values[idxs[0][0], idxs[1][0]]
125 s = ('The allowed values for a DataFrame'
126 ' are True, False, 0, 1. Found value %s' % (val))
127 raise ValueError(s)
128
129 79.6 MiB 0.0 MiB is_sparse = hasattr(df, "to_coo")
130 79.6 MiB 0.0 MiB if is_sparse:
131 if not isinstance(df.columns[0], str) and df.columns[0] != 0:
132 raise ValueError('Due to current limitations in Pandas, '
133 'if the SparseDataFrame has integer column names,'
134 'names, please make sure they either start '
135 'with `0` or cast them as string column names: '
136 '`df.columns = [str(i) for i in df.columns`].')
137
138 X = df.to_coo().tocsc()
139 support = _support(X, X.shape[0], is_sparse)
140 else:
141 79.6 MiB 0.0 MiB X = df.values
142 79.6 MiB 0.0 MiB support = _support(X, X.shape[0], is_sparse)
143
144 79.6 MiB 0.0 MiB ary_col_idx = np.arange(X.shape[1])
145 79.6 MiB 0.0 MiB support_dict = {1: support[support >= min_support]}
146 79.6 MiB 0.0 MiB itemset_dict = {1: ary_col_idx[support >= min_support].reshape(-1, 1)}
147 79.6 MiB 0.0 MiB max_itemset = 1
148 79.6 MiB 0.0 MiB rows_count = float(X.shape[0])
149
150 84.2 MiB 0.0 MiB while max_itemset and max_itemset < (max_len or float('inf')):
151 84.2 MiB 0.0 MiB next_max_itemset = max_itemset + 1
152 84.2 MiB 0.0 MiB combin = np.array(list(generate_new_combinations(itemset_dict[max_itemset])))
153
154 84.2 MiB 0.0 MiB if combin.size == 0:
155 break
156
157 84.2 MiB 0.0 MiB if is_sparse:
158 all_ones = np.ones((int(rows_count), 1))
159 _bools = X[:, combin[:, 0]] == all_ones
160 for n in range(1, combin.shape[1]):
161 _bools = _bools & (X[:, combin[:, n]] == all_ones)
162 else:
163 84.2 MiB 2.2 MiB _bools = np.all(X[:, combin], axis=2)
164
165 84.2 MiB 0.0 MiB support = _support(np.array(_bools), rows_count, is_sparse)
166 84.2 MiB 0.0 MiB _mask = (support >= min_support).reshape(-1)
167
168 84.2 MiB 0.0 MiB if any(_mask):
169 84.2 MiB 0.0 MiB itemset_dict[next_max_itemset] = np.array(combin[_mask])
170 84.2 MiB 0.0 MiB support_dict[next_max_itemset] = np.array(support[_mask])
171 84.2 MiB 0.0 MiB max_itemset = next_max_itemset
172 else:
173 83.8 MiB 0.0 MiB break
174 83.8 MiB 0.0 MiB all_res = []
175 83.8 MiB 0.0 MiB for k in sorted(itemset_dict):
176 83.8 MiB 0.0 MiB support = pd.Series(support_dict[k])
177 83.8 MiB 0.0 MiB itemsets = pd.Series([frozenset(i) for i in itemset_dict[k]])
178
179 83.8 MiB 0.0 MiB res = pd.concat((support, itemsets), axis=1)
180 83.8 MiB 0.0 MiB all_res.append(res)
181
182 83.9 MiB 0.0 MiB res_df = pd.concat(all_res)
183 83.9 MiB 0.0 MiB res_df.columns = ['support', 'itemsets']
184 83.9 MiB 0.0 MiB if use_colnames:
185 mapping = {idx: item for idx, item in enumerate(df.columns)}
186 res_df['itemsets'] = res_df['itemsets'].apply(lambda x: frozenset([
187 mapping[i] for i in x]))
188 83.9 MiB 0.0 MiB res_df = res_df.reset_index(drop=True)
189
190 83.9 MiB 0.0 MiB if verbose:
191 print() # adds newline if verbose counter was used
192
193 83.9 MiB 0.0 MiB return res_df
# Smaller dataset, new implementation, dense dataframe.
# Identical setup to the "current implementation" dense run; only the
# apriori being imported has changed (the vectorized rewrite profiled above).
from timeit import repeat

import pandas as pd
from mlxtend.frequent_patterns.apriori import apriori
from mlxtend.preprocessing import TransactionEncoder

df = pd.read_csv('/Projects/drink_recommendation_engine/sample.csv', sep='\t')

# Collapse each ticket's items into a tuple, then one-hot encode.
te = TransactionEncoder()
_items = te.fit_transform(df.groupby(['ticket_id'])['item_name'].apply(tuple))
items = pd.DataFrame(_items, columns=te.columns_)

# 10 repeats of 10 runs each at 5% minimum support; returns total seconds per repeat.
repeat("apriori(items, min_support=0.05)", globals=globals(), number=10, repeat=10)
Out[2]:
[0.12199777899999997,
0.11085488300000002,
0.12048885899999995,
0.12400905900000003,
0.1155322019999998,
0.11182724500000019,
0.12058627099999963,
0.11633145700000025,
0.11280813100000042,
0.11844944400000035]
# Smaller dataset, new implementation, sparse dataframe.
# Same benchmark as the dense "new implementation" run, on a SparseDataFrame
# (NOTE(review): pd.SparseDataFrame is deprecated/removed in pandas >= 1.0 —
# this transcript predates that).
from timeit import repeat

import pandas as pd
from mlxtend.frequent_patterns.apriori import apriori
from mlxtend.preprocessing import TransactionEncoder

df = pd.read_csv('/Projects/drink_recommendation_engine/sample.csv', sep='\t')

# Collapse each ticket's items into a tuple, then one-hot encode.
te = TransactionEncoder()
_items = te.fit_transform(df.groupby(['ticket_id'])['item_name'].apply(tuple))
items = pd.SparseDataFrame(_items, columns=te.columns_, default_fill_value=0)

# 10 repeats of 10 runs each at 5% minimum support; returns total seconds per repeat.
repeat("apriori(items, min_support=0.05)", globals=globals(), number=10, repeat=10)
Out[2]:
[0.5350212700000001,
0.5399577099999999,
0.5390251770000001,
0.5814595669999996,
0.5923116000000004,
0.5686850980000004,
0.5839204499999999,
0.4983124719999994,
0.5384785540000001,
0.5440615109999998]
# Run 4, larger dataset, new implementation
Line # Mem usage Increment Line Contents
================================================
55 95.7 MiB 95.7 MiB @profile
56 def apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0):
57 """Get frequent itemsets from a one-hot DataFrame
58 Parameters
59 -----------
60 df : pandas DataFrame or pandas SparseDataFrame
61 pandas DataFrame the encoded format.
62 The allowed values are either 0/1 or True/False.
63 For example,
64
65 ```
66 Apple Bananas Beer Chicken Milk Rice
67 0 1 0 1 1 0 1
68 1 1 0 1 0 0 1
69 2 1 0 1 0 0 0
70 3 1 1 0 0 0 0
71 4 0 0 1 1 1 1
72 5 0 0 1 0 1 1
73 6 0 0 1 0 1 0
74 7 1 1 0 0 0 0
75 ```
76
77 min_support : float (default: 0.5)
78 A float between 0 and 1 for minumum support of the itemsets returned.
79 The support is computed as the fraction
80 transactions_where_item(s)_occur / total_transactions.
81
82 use_colnames : bool (default: False)
83 If true, uses the DataFrames' column names in the returned DataFrame
84 instead of column indices.
85
86 max_len : int (default: None)
87 Maximum length of the itemsets generated. If `None` (default) all
88 possible itemsets lengths (under the apriori condition) are evaluated.
89
90 verbose : int (default: 0)
91 Shows the number of iterations if 1.
92
93 Returns
94 -----------
95 pandas DataFrame with columns ['support', 'itemsets'] of all itemsets
96 that are >= `min_support` and < than `max_len`
97 (if `max_len` is not None).
98 Each itemset in the 'itemsets' column is of type `frozenset`,
99 which is a Python built-in type that behaves similarly to
100 sets except that it is immutable
101 (For more info, see
102 https://docs.python.org/3.6/library/stdtypes.html#frozenset).
103
104 Examples
105 -----------
106 For usage examples, please see
107 http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/
108
109 """
110
111 118.2 MiB 2.9 MiB def _support(_x, _n_rows, _is_sparse):
112 """
113 DRY local method to calculate support as the row-wise sum of values / number of rows
114 :param _x: matrix of bools or binary
115 :param _n_rows: numeric, number of rows
116 :param _is_sparse: bool True if _x is sparse
117 :return: np.array, shape = (n_rows, )
118 """
119 118.2 MiB 0.0 MiB out = (np.sum(_x, axis=0) / _n_rows)
120 118.2 MiB 0.0 MiB return np.array(out).reshape(-1)
121
122 112.4 MiB 16.7 MiB idxs = np.where((df.values != 1) & (df.values != 0))
123 112.4 MiB 0.0 MiB if len(idxs[0]) > 0:
124 val = df.values[idxs[0][0], idxs[1][0]]
125 s = ('The allowed values for a DataFrame'
126 ' are True, False, 0, 1. Found value %s' % (val))
127 raise ValueError(s)
128
129 112.4 MiB 0.0 MiB is_sparse = hasattr(df, "to_coo")
130 112.4 MiB 0.0 MiB if is_sparse:
131 if not isinstance(df.columns[0], str) and df.columns[0] != 0:
132 raise ValueError('Due to current limitations in Pandas, '
133 'if the SparseDataFrame has integer column names,'
134 'names, please make sure they either start '
135 'with `0` or cast them as string column names: '
136 '`df.columns = [str(i) for i in df.columns`].')
137
138 X = df.to_coo().tocsc()
139 support = _support(X, X.shape[0], is_sparse)
140 else:
141 112.4 MiB 0.0 MiB X = df.values
142 112.4 MiB 0.0 MiB support = _support(X, X.shape[0], is_sparse)
143
144 112.4 MiB 0.0 MiB ary_col_idx = np.arange(X.shape[1])
145 112.4 MiB 0.0 MiB support_dict = {1: support[support >= min_support]}
146 112.4 MiB 0.0 MiB itemset_dict = {1: ary_col_idx[support >= min_support].reshape(-1, 1)}
147 112.4 MiB 0.0 MiB max_itemset = 1
148 112.4 MiB 0.0 MiB rows_count = float(X.shape[0])
149
150 117.7 MiB 0.0 MiB while max_itemset and max_itemset < (max_len or float('inf')):
151 117.7 MiB 0.0 MiB next_max_itemset = max_itemset + 1
152 117.7 MiB 0.0 MiB combin = np.array(list(generate_new_combinations(itemset_dict[max_itemset])))
153
154 117.7 MiB 0.0 MiB if combin.size == 0:
155 break
156
157 117.7 MiB 0.0 MiB if is_sparse:
158 all_ones = np.ones((int(rows_count), 1))
159 _bools = X[:, combin[:, 0]] == all_ones
160 for n in range(1, combin.shape[1]):
161 _bools = _bools & (X[:, combin[:, n]] == all_ones)
162 else:
163 117.8 MiB 2.9 MiB _bools = np.all(X[:, combin], axis=2)
164
165 117.8 MiB 0.0 MiB support = _support(np.array(_bools), rows_count, is_sparse)
166 117.8 MiB 0.0 MiB _mask = (support >= min_support).reshape(-1)
167
168 117.8 MiB 0.0 MiB if any(_mask):
169 117.7 MiB 0.0 MiB itemset_dict[next_max_itemset] = np.array(combin[_mask])
170 117.7 MiB 0.0 MiB support_dict[next_max_itemset] = np.array(support[_mask])
171 117.7 MiB 0.0 MiB max_itemset = next_max_itemset
172 else:
173 117.8 MiB 0.0 MiB break
174 117.8 MiB 0.0 MiB all_res = []
175 117.9 MiB 0.0 MiB for k in sorted(itemset_dict):
176 117.9 MiB 0.0 MiB support = pd.Series(support_dict[k])
177 117.9 MiB 0.0 MiB itemsets = pd.Series([frozenset(i) for i in itemset_dict[k]])
178
179 117.9 MiB 0.0 MiB res = pd.concat((support, itemsets), axis=1)
180 117.9 MiB 0.0 MiB all_res.append(res)
181
182 117.9 MiB 0.0 MiB res_df = pd.concat(all_res)
183 117.9 MiB 0.0 MiB res_df.columns = ['support', 'itemsets']
184 117.9 MiB 0.0 MiB if use_colnames:
185 mapping = {idx: item for idx, item in enumerate(df.columns)}
186 res_df['itemsets'] = res_df['itemsets'].apply(lambda x: frozenset([
187 mapping[i] for i in x]))
188 117.9 MiB 0.0 MiB res_df = res_df.reset_index(drop=True)
189
190 117.9 MiB 0.0 MiB if verbose:
191 print() # adds newline if verbose counter was used
192
193 117.9 MiB 0.0 MiB return res_df
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment