-
-
Save jmayse/ad688d6a7fd842269996a701d7cecd4c to your computer and use it in GitHub Desktop.
A priori profiling
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Run 1, smaller dataset (1k tx, 244 items), current implementation | |
Line # Mem usage Increment Line Contents | |
================================================ | |
55 78.8 MiB 78.8 MiB @profile | |
56 def apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0): | |
57 """Get frequent itemsets from a one-hot DataFrame | |
58 Parameters | |
59 ----------- | |
60 df : pandas DataFrame or pandas SparseDataFrame | |
61 pandas DataFrame the encoded format. | |
62 The allowed values are either 0/1 or True/False. | |
63 For example, | |
64 | |
65 ``` | |
66 Apple Bananas Beer Chicken Milk Rice | |
67 0 1 0 1 1 0 1 | |
68 1 1 0 1 0 0 1 | |
69 2 1 0 1 0 0 0 | |
70 3 1 1 0 0 0 0 | |
71 4 0 0 1 1 1 1 | |
72 5 0 0 1 0 1 1 | |
73 6 0 0 1 0 1 0 | |
74 7 1 1 0 0 0 0 | |
75 ``` | |
76 | |
77 min_support : float (default: 0.5) | |
78 A float between 0 and 1 for minumum support of the itemsets returned. | |
79 The support is computed as the fraction | |
80 transactions_where_item(s)_occur / total_transactions. | |
81 | |
82 use_colnames : bool (default: False) | |
83 If true, uses the DataFrames' column names in the returned DataFrame | |
84 instead of column indices. | |
85 | |
86 max_len : int (default: None) | |
87 Maximum length of the itemsets generated. If `None` (default) all | |
88 possible itemsets lengths (under the apriori condition) are evaluated. | |
89 | |
90 verbose : int (default: 0) | |
91 Shows the number of iterations if 1. | |
92 | |
93 Returns | |
94 ----------- | |
95 pandas DataFrame with columns ['support', 'itemsets'] of all itemsets | |
96 that are >= `min_support` and < than `max_len` | |
97 (if `max_len` is not None). | |
98 Each itemset in the 'itemsets' column is of type `frozenset`, | |
99 which is a Python built-in type that behaves similarly to | |
100 sets except that it is immutable | |
101 (For more info, see | |
102 https://docs.python.org/3.6/library/stdtypes.html#frozenset). | |
103 | |
104 Examples | |
105 ----------- | |
106 For usage examples, please see | |
107 http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/ | |
108 | |
109 """ | |
110 79.6 MiB 0.8 MiB idxs = np.where((df.values != 1) & (df.values != 0)) | |
111 79.6 MiB 0.0 MiB if len(idxs[0]) > 0: | |
112 val = df.values[idxs[0][0], idxs[1][0]] | |
113 s = ('The allowed values for a DataFrame' | |
114 ' are True, False, 0, 1. Found value %s' % (val)) | |
115 raise ValueError(s) | |
116 | |
117 79.6 MiB 0.0 MiB is_sparse = hasattr(df, "to_coo") | |
118 79.6 MiB 0.0 MiB if is_sparse: | |
119 if not isinstance(df.columns[0], str) and df.columns[0] != 0: | |
120 raise ValueError('Due to current limitations in Pandas, ' | |
121 'if the SparseDataFrame has integer column names,' | |
122 'names, please make sure they either start ' | |
123 'with `0` or cast them as string column names: ' | |
124 '`df.columns = [str(i) for i in df.columns`].') | |
125 | |
126 X = df.to_coo().tocsc() | |
127 support = np.array(np.sum(X, axis=0) / float(X.shape[0])).reshape(-1) | |
128 else: | |
129 79.6 MiB 0.0 MiB X = df.values | |
130 79.6 MiB 0.0 MiB support = (np.sum(X, axis=0) / float(X.shape[0])) | |
131 | |
132 79.6 MiB 0.0 MiB ary_col_idx = np.arange(X.shape[1]) | |
133 79.6 MiB 0.0 MiB support_dict = {1: support[support >= min_support]} | |
134 79.6 MiB 0.0 MiB itemset_dict = {1: ary_col_idx[support >= min_support].reshape(-1, 1)} | |
135 79.6 MiB 0.0 MiB max_itemset = 1 | |
136 79.6 MiB 0.0 MiB rows_count = float(X.shape[0]) | |
137 | |
138 79.6 MiB 0.0 MiB if max_len is None: | |
139 79.6 MiB 0.0 MiB max_len = float('inf') | |
140 | |
141 79.6 MiB 0.0 MiB iter_count = 0 | |
142 | |
143 79.7 MiB 0.0 MiB while max_itemset and max_itemset < max_len: | |
144 79.7 MiB 0.0 MiB next_max_itemset = max_itemset + 1 | |
145 79.7 MiB 0.0 MiB combin = generate_new_combinations(itemset_dict[max_itemset]) | |
146 79.7 MiB 0.0 MiB frequent_items = [] | |
147 79.7 MiB 0.0 MiB frequent_items_support = [] | |
148 | |
149 79.7 MiB 0.0 MiB if is_sparse: | |
150 all_ones = np.ones((X.shape[0], next_max_itemset)) | |
151 79.7 MiB 0.0 MiB for c in combin: | |
152 79.7 MiB 0.0 MiB if verbose: | |
153 iter_count += 1 | |
154 print('\rIteration: %d | Sampling itemset size %d' % | |
155 (iter_count, next_max_itemset), end="") | |
156 79.7 MiB 0.0 MiB if is_sparse: | |
157 together = np.all(X[:, c] == all_ones, axis=1) | |
158 else: | |
159 79.7 MiB 0.0 MiB together = X[:, c].all(axis=1) | |
160 79.7 MiB 0.0 MiB support = together.sum() / rows_count | |
161 79.7 MiB 0.0 MiB if support >= min_support: | |
162 79.7 MiB 0.0 MiB frequent_items.append(c) | |
163 79.7 MiB 0.0 MiB frequent_items_support.append(support) | |
164 | |
165 79.7 MiB 0.0 MiB if frequent_items: | |
166 79.7 MiB 0.0 MiB itemset_dict[next_max_itemset] = np.array(frequent_items) | |
167 79.7 MiB 0.0 MiB support_dict[next_max_itemset] = np.array(frequent_items_support) | |
168 79.7 MiB 0.0 MiB max_itemset = next_max_itemset | |
169 else: | |
170 79.7 MiB 0.0 MiB max_itemset = 0 | |
171 | |
172 79.7 MiB 0.0 MiB all_res = [] | |
173 79.7 MiB 0.0 MiB for k in sorted(itemset_dict): | |
174 79.7 MiB 0.0 MiB support = pd.Series(support_dict[k]) | |
175 79.7 MiB 0.0 MiB itemsets = pd.Series([frozenset(i) for i in itemset_dict[k]]) | |
176 | |
177 79.7 MiB 0.0 MiB res = pd.concat((support, itemsets), axis=1) | |
178 79.7 MiB 0.0 MiB all_res.append(res) | |
179 | |
180 79.8 MiB 0.0 MiB res_df = pd.concat(all_res) | |
181 79.8 MiB 0.0 MiB res_df.columns = ['support', 'itemsets'] | |
182 79.8 MiB 0.0 MiB if use_colnames: | |
183 mapping = {idx: item for idx, item in enumerate(df.columns)} | |
184 res_df['itemsets'] = res_df['itemsets'].apply(lambda x: frozenset([ | |
185 mapping[i] for i in x])) | |
186 79.8 MiB 0.0 MiB res_df = res_df.reset_index(drop=True) | |
187 | |
188 79.8 MiB 0.0 MiB if verbose: | |
189 print() # adds newline if verbose counter was used | |
190 | |
191 79.8 MiB 0.0 MiB return res_df | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Smaller dataset, current implementation, dense dataframe
from timeit import repeat

import pandas as pd

from mlxtend.frequent_patterns.apriori import apriori
from mlxtend.preprocessing import TransactionEncoder

# Load the raw ticket/item rows and one-hot encode each ticket's items.
raw = pd.read_csv('/Projects/drink_recommendation_engine/sample.csv', sep='\t')
encoder = TransactionEncoder()
encoded = encoder.fit_transform(raw.groupby(['ticket_id'])['item_name'].apply(tuple))
# NOTE: `items` must stay at module scope — the timed statement reads it via globals().
items = pd.DataFrame(encoded, columns=encoder.columns_)

# 10 repeats of 10 apriori calls each, on the dense one-hot frame.
repeat("apriori(items, min_support=0.05)", globals=globals(), number=10, repeat=10)
Out[2]: | |
[0.3362377780000001, | |
0.34670980900000004, | |
0.3500013879999999, | |
0.3401237049999999, | |
0.34403185000000036, | |
0.34166972399999995, | |
0.3381037950000003, | |
0.34704037900000007, | |
0.35623237099999994, | |
0.35210514500000034] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Run 2, larger dataset (8.6k tx, 966 items), current implementation | |
/usr/local/bin/python3.7 /Projects/mlxtend/mlxtend/frequent_patterns/apriori.py | |
Filename: /Projects/mlxtend/mlxtend/frequent_patterns/apriori.py | |
Line # Mem usage Increment Line Contents | |
================================================ | |
55 95.7 MiB 95.7 MiB @profile | |
56 def apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0): | |
57 """Get frequent itemsets from a one-hot DataFrame | |
58 Parameters | |
59 ----------- | |
60 df : pandas DataFrame or pandas SparseDataFrame | |
61 pandas DataFrame the encoded format. | |
62 The allowed values are either 0/1 or True/False. | |
63 For example, | |
64 | |
65 ``` | |
66 Apple Bananas Beer Chicken Milk Rice | |
67 0 1 0 1 1 0 1 | |
68 1 1 0 1 0 0 1 | |
69 2 1 0 1 0 0 0 | |
70 3 1 1 0 0 0 0 | |
71 4 0 0 1 1 1 1 | |
72 5 0 0 1 0 1 1 | |
73 6 0 0 1 0 1 0 | |
74 7 1 1 0 0 0 0 | |
75 ``` | |
76 | |
77 min_support : float (default: 0.5) | |
78 A float between 0 and 1 for minumum support of the itemsets returned. | |
79 The support is computed as the fraction | |
80 transactions_where_item(s)_occur / total_transactions. | |
81 | |
82 use_colnames : bool (default: False) | |
83 If true, uses the DataFrames' column names in the returned DataFrame | |
84 instead of column indices. | |
85 | |
86 max_len : int (default: None) | |
87 Maximum length of the itemsets generated. If `None` (default) all | |
88 possible itemsets lengths (under the apriori condition) are evaluated. | |
89 | |
90 verbose : int (default: 0) | |
91 Shows the number of iterations if 1. | |
92 | |
93 Returns | |
94 ----------- | |
95 pandas DataFrame with columns ['support', 'itemsets'] of all itemsets | |
96 that are >= `min_support` and < than `max_len` | |
97 (if `max_len` is not None). | |
98 Each itemset in the 'itemsets' column is of type `frozenset`, | |
99 which is a Python built-in type that behaves similarly to | |
100 sets except that it is immutable | |
101 (For more info, see | |
102 https://docs.python.org/3.6/library/stdtypes.html#frozenset). | |
103 | |
104 Examples | |
105 ----------- | |
106 For usage examples, please see | |
107 http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/ | |
108 | |
109 """ | |
110 112.4 MiB 16.7 MiB idxs = np.where((df.values != 1) & (df.values != 0)) | |
111 112.4 MiB 0.0 MiB if len(idxs[0]) > 0: | |
112 val = df.values[idxs[0][0], idxs[1][0]] | |
113 s = ('The allowed values for a DataFrame' | |
114 ' are True, False, 0, 1. Found value %s' % (val)) | |
115 raise ValueError(s) | |
116 | |
117 112.4 MiB 0.0 MiB is_sparse = hasattr(df, "to_coo") | |
118 112.4 MiB 0.0 MiB if is_sparse: | |
119 if not isinstance(df.columns[0], str) and df.columns[0] != 0: | |
120 raise ValueError('Due to current limitations in Pandas, ' | |
121 'if the SparseDataFrame has integer column names,' | |
122 'names, please make sure they either start ' | |
123 'with `0` or cast them as string column names: ' | |
124 '`df.columns = [str(i) for i in df.columns`].') | |
125 | |
126 X = df.to_coo().tocsc() | |
127 support = np.array(np.sum(X, axis=0) / float(X.shape[0])).reshape(-1) | |
128 else: | |
129 112.4 MiB 0.0 MiB X = df.values | |
130 112.4 MiB 0.0 MiB support = (np.sum(X, axis=0) / float(X.shape[0])) | |
131 | |
132 112.4 MiB 0.0 MiB ary_col_idx = np.arange(X.shape[1]) | |
133 112.4 MiB 0.0 MiB support_dict = {1: support[support >= min_support]} | |
134 112.4 MiB 0.0 MiB itemset_dict = {1: ary_col_idx[support >= min_support].reshape(-1, 1)} | |
135 112.4 MiB 0.0 MiB max_itemset = 1 | |
136 112.4 MiB 0.0 MiB rows_count = float(X.shape[0]) | |
137 | |
138 112.4 MiB 0.0 MiB if max_len is None: | |
139 112.4 MiB 0.0 MiB max_len = float('inf') | |
140 | |
141 112.4 MiB 0.0 MiB iter_count = 0 | |
142 | |
143 112.5 MiB 0.0 MiB while max_itemset and max_itemset < max_len: | |
144 112.5 MiB 0.0 MiB next_max_itemset = max_itemset + 1 | |
145 112.5 MiB 0.0 MiB combin = generate_new_combinations(itemset_dict[max_itemset]) | |
146 112.5 MiB 0.0 MiB frequent_items = [] | |
147 112.5 MiB 0.0 MiB frequent_items_support = [] | |
148 | |
149 112.5 MiB 0.0 MiB if is_sparse: | |
150 all_ones = np.ones((X.shape[0], next_max_itemset)) | |
151 112.5 MiB 0.0 MiB for c in combin: | |
152 112.5 MiB 0.0 MiB if verbose: | |
153 iter_count += 1 | |
154 print('\rIteration: %d | Sampling itemset size %d' % | |
155 (iter_count, next_max_itemset), end="") | |
156 112.5 MiB 0.0 MiB if is_sparse: | |
157 together = np.all(X[:, c] == all_ones, axis=1) | |
158 else: | |
159 112.5 MiB 0.0 MiB together = X[:, c].all(axis=1) | |
160 112.5 MiB 0.0 MiB support = together.sum() / rows_count | |
161 112.5 MiB 0.0 MiB if support >= min_support: | |
162 112.5 MiB 0.0 MiB frequent_items.append(c) | |
163 112.5 MiB 0.0 MiB frequent_items_support.append(support) | |
164 | |
165 112.5 MiB 0.0 MiB if frequent_items: | |
166 112.5 MiB 0.0 MiB itemset_dict[next_max_itemset] = np.array(frequent_items) | |
167 112.5 MiB 0.0 MiB support_dict[next_max_itemset] = np.array(frequent_items_support) | |
168 112.5 MiB 0.0 MiB max_itemset = next_max_itemset | |
169 else: | |
170 112.5 MiB 0.0 MiB max_itemset = 0 | |
171 | |
172 112.5 MiB 0.0 MiB all_res = [] | |
173 112.5 MiB 0.0 MiB for k in sorted(itemset_dict): | |
174 112.5 MiB 0.0 MiB support = pd.Series(support_dict[k]) | |
175 112.5 MiB 0.0 MiB itemsets = pd.Series([frozenset(i) for i in itemset_dict[k]]) | |
176 | |
177 112.5 MiB 0.0 MiB res = pd.concat((support, itemsets), axis=1) | |
178 112.5 MiB 0.0 MiB all_res.append(res) | |
179 | |
180 112.6 MiB 0.0 MiB res_df = pd.concat(all_res) | |
181 112.6 MiB 0.0 MiB res_df.columns = ['support', 'itemsets'] | |
182 112.6 MiB 0.0 MiB if use_colnames: | |
183 mapping = {idx: item for idx, item in enumerate(df.columns)} | |
184 res_df['itemsets'] = res_df['itemsets'].apply(lambda x: frozenset([ | |
185 mapping[i] for i in x])) | |
186 112.6 MiB 0.0 MiB res_df = res_df.reset_index(drop=True) | |
187 | |
188 112.6 MiB 0.0 MiB if verbose: | |
189 print() # adds newline if verbose counter was used | |
190 | |
191 112.6 MiB 0.0 MiB return res_df |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Smaller dataset, current implementation, sparse dataframe
from timeit import repeat

import pandas as pd

from mlxtend.frequent_patterns.apriori import apriori
from mlxtend.preprocessing import TransactionEncoder

# Load the raw ticket/item rows and one-hot encode each ticket's items.
raw = pd.read_csv('/Projects/drink_recommendation_engine/sample.csv', sep='\t')
encoder = TransactionEncoder()
encoded = encoder.fit_transform(raw.groupby(['ticket_id'])['item_name'].apply(tuple))
# Sparse variant of the same frame; fill value 0 so absent items are implicit.
# NOTE: `items` must stay at module scope — the timed statement reads it via globals().
items = pd.SparseDataFrame(encoded, columns=encoder.columns_, default_fill_value=0)

# 10 repeats of 10 apriori calls each, on the sparse one-hot frame.
repeat("apriori(items, min_support=0.05)", globals=globals(), number=10, repeat=10)
Out[2]: | |
[3.3068226319999994, | |
3.2766413269999983, | |
3.3034355029999993, | |
3.4978498950000017, | |
3.502342746, | |
3.5774782919999986, | |
3.6397878360000036, | |
3.361885745000002, | |
3.2973721370000035, | |
3.293594581999997] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Run 3, small dataset, new implementation | |
Line # Mem usage Increment Line Contents | |
================================================ | |
55 78.8 MiB 78.8 MiB @profile | |
56 def apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0): | |
57 """Get frequent itemsets from a one-hot DataFrame | |
58 Parameters | |
59 ----------- | |
60 df : pandas DataFrame or pandas SparseDataFrame | |
61 pandas DataFrame the encoded format. | |
62 The allowed values are either 0/1 or True/False. | |
63 For example, | |
64 | |
65 ``` | |
66 Apple Bananas Beer Chicken Milk Rice | |
67 0 1 0 1 1 0 1 | |
68 1 1 0 1 0 0 1 | |
69 2 1 0 1 0 0 0 | |
70 3 1 1 0 0 0 0 | |
71 4 0 0 1 1 1 1 | |
72 5 0 0 1 0 1 1 | |
73 6 0 0 1 0 1 0 | |
74 7 1 1 0 0 0 0 | |
75 ``` | |
76 | |
77 min_support : float (default: 0.5) | |
78 A float between 0 and 1 for minumum support of the itemsets returned. | |
79 The support is computed as the fraction | |
80 transactions_where_item(s)_occur / total_transactions. | |
81 | |
82 use_colnames : bool (default: False) | |
83 If true, uses the DataFrames' column names in the returned DataFrame | |
84 instead of column indices. | |
85 | |
86 max_len : int (default: None) | |
87 Maximum length of the itemsets generated. If `None` (default) all | |
88 possible itemsets lengths (under the apriori condition) are evaluated. | |
89 | |
90 verbose : int (default: 0) | |
91 Shows the number of iterations if 1. | |
92 | |
93 Returns | |
94 ----------- | |
95 pandas DataFrame with columns ['support', 'itemsets'] of all itemsets | |
96 that are >= `min_support` and < than `max_len` | |
97 (if `max_len` is not None). | |
98 Each itemset in the 'itemsets' column is of type `frozenset`, | |
99 which is a Python built-in type that behaves similarly to | |
100 sets except that it is immutable | |
101 (For more info, see | |
102 https://docs.python.org/3.6/library/stdtypes.html#frozenset). | |
103 | |
104 Examples | |
105 ----------- | |
106 For usage examples, please see | |
107 http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/ | |
108 | |
109 """ | |
110 | |
111 84.7 MiB 0.9 MiB def _support(_x, _n_rows, _is_sparse): | |
112 """ | |
113 DRY local method to calculate support as the row-wise sum of values / number of rows | |
114 :param _x: matrix of bools or binary | |
115 :param _n_rows: numeric, number of rows | |
116 :param _is_sparse: bool True if _x is sparse | |
117 :return: np.array, shape = (n_rows, ) | |
118 """ | |
119 84.7 MiB 0.0 MiB out = (np.sum(_x, axis=0) / _n_rows) | |
120 84.7 MiB 0.0 MiB return np.array(out).reshape(-1) | |
121 | |
122 79.6 MiB 0.8 MiB idxs = np.where((df.values != 1) & (df.values != 0)) | |
123 79.6 MiB 0.0 MiB if len(idxs[0]) > 0: | |
124 val = df.values[idxs[0][0], idxs[1][0]] | |
125 s = ('The allowed values for a DataFrame' | |
126 ' are True, False, 0, 1. Found value %s' % (val)) | |
127 raise ValueError(s) | |
128 | |
129 79.6 MiB 0.0 MiB is_sparse = hasattr(df, "to_coo") | |
130 79.6 MiB 0.0 MiB if is_sparse: | |
131 if not isinstance(df.columns[0], str) and df.columns[0] != 0: | |
132 raise ValueError('Due to current limitations in Pandas, ' | |
133 'if the SparseDataFrame has integer column names,' | |
134 'names, please make sure they either start ' | |
135 'with `0` or cast them as string column names: ' | |
136 '`df.columns = [str(i) for i in df.columns`].') | |
137 | |
138 X = df.to_coo().tocsc() | |
139 support = _support(X, X.shape[0], is_sparse) | |
140 else: | |
141 79.6 MiB 0.0 MiB X = df.values | |
142 79.6 MiB 0.0 MiB support = _support(X, X.shape[0], is_sparse) | |
143 | |
144 79.6 MiB 0.0 MiB ary_col_idx = np.arange(X.shape[1]) | |
145 79.6 MiB 0.0 MiB support_dict = {1: support[support >= min_support]} | |
146 79.6 MiB 0.0 MiB itemset_dict = {1: ary_col_idx[support >= min_support].reshape(-1, 1)} | |
147 79.6 MiB 0.0 MiB max_itemset = 1 | |
148 79.6 MiB 0.0 MiB rows_count = float(X.shape[0]) | |
149 | |
150 84.2 MiB 0.0 MiB while max_itemset and max_itemset < (max_len or float('inf')): | |
151 84.2 MiB 0.0 MiB next_max_itemset = max_itemset + 1 | |
152 84.2 MiB 0.0 MiB combin = np.array(list(generate_new_combinations(itemset_dict[max_itemset]))) | |
153 | |
154 84.2 MiB 0.0 MiB if combin.size == 0: | |
155 break | |
156 | |
157 84.2 MiB 0.0 MiB if is_sparse: | |
158 all_ones = np.ones((int(rows_count), 1)) | |
159 _bools = X[:, combin[:, 0]] == all_ones | |
160 for n in range(1, combin.shape[1]): | |
161 _bools = _bools & (X[:, combin[:, n]] == all_ones) | |
162 else: | |
163 84.2 MiB 2.2 MiB _bools = np.all(X[:, combin], axis=2) | |
164 | |
165 84.2 MiB 0.0 MiB support = _support(np.array(_bools), rows_count, is_sparse) | |
166 84.2 MiB 0.0 MiB _mask = (support >= min_support).reshape(-1) | |
167 | |
168 84.2 MiB 0.0 MiB if any(_mask): | |
169 84.2 MiB 0.0 MiB itemset_dict[next_max_itemset] = np.array(combin[_mask]) | |
170 84.2 MiB 0.0 MiB support_dict[next_max_itemset] = np.array(support[_mask]) | |
171 84.2 MiB 0.0 MiB max_itemset = next_max_itemset | |
172 else: | |
173 83.8 MiB 0.0 MiB break | |
174 83.8 MiB 0.0 MiB all_res = [] | |
175 83.8 MiB 0.0 MiB for k in sorted(itemset_dict): | |
176 83.8 MiB 0.0 MiB support = pd.Series(support_dict[k]) | |
177 83.8 MiB 0.0 MiB itemsets = pd.Series([frozenset(i) for i in itemset_dict[k]]) | |
178 | |
179 83.8 MiB 0.0 MiB res = pd.concat((support, itemsets), axis=1) | |
180 83.8 MiB 0.0 MiB all_res.append(res) | |
181 | |
182 83.9 MiB 0.0 MiB res_df = pd.concat(all_res) | |
183 83.9 MiB 0.0 MiB res_df.columns = ['support', 'itemsets'] | |
184 83.9 MiB 0.0 MiB if use_colnames: | |
185 mapping = {idx: item for idx, item in enumerate(df.columns)} | |
186 res_df['itemsets'] = res_df['itemsets'].apply(lambda x: frozenset([ | |
187 mapping[i] for i in x])) | |
188 83.9 MiB 0.0 MiB res_df = res_df.reset_index(drop=True) | |
189 | |
190 83.9 MiB 0.0 MiB if verbose: | |
191 print() # adds newline if verbose counter was used | |
192 | |
193 83.9 MiB 0.0 MiB return res_df |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Smaller dataset, new implementation, dense dataframe
from timeit import repeat

import pandas as pd

from mlxtend.frequent_patterns.apriori import apriori
from mlxtend.preprocessing import TransactionEncoder

# Load the raw ticket/item rows and one-hot encode each ticket's items.
raw = pd.read_csv('/Projects/drink_recommendation_engine/sample.csv', sep='\t')
encoder = TransactionEncoder()
encoded = encoder.fit_transform(raw.groupby(['ticket_id'])['item_name'].apply(tuple))
# NOTE: `items` must stay at module scope — the timed statement reads it via globals().
items = pd.DataFrame(encoded, columns=encoder.columns_)

# 10 repeats of 10 apriori calls each, on the dense one-hot frame.
repeat("apriori(items, min_support=0.05)", globals=globals(), number=10, repeat=10)
Out[2]: | |
[0.12199777899999997, | |
0.11085488300000002, | |
0.12048885899999995, | |
0.12400905900000003, | |
0.1155322019999998, | |
0.11182724500000019, | |
0.12058627099999963, | |
0.11633145700000025, | |
0.11280813100000042, | |
0.11844944400000035] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Smaller dataset, new implementation, sparse dataframe
from timeit import repeat

import pandas as pd

from mlxtend.frequent_patterns.apriori import apriori
from mlxtend.preprocessing import TransactionEncoder

# Load the raw ticket/item rows and one-hot encode each ticket's items.
raw = pd.read_csv('/Projects/drink_recommendation_engine/sample.csv', sep='\t')
encoder = TransactionEncoder()
encoded = encoder.fit_transform(raw.groupby(['ticket_id'])['item_name'].apply(tuple))
# Sparse variant of the same frame; fill value 0 so absent items are implicit.
# NOTE: `items` must stay at module scope — the timed statement reads it via globals().
items = pd.SparseDataFrame(encoded, columns=encoder.columns_, default_fill_value=0)

# 10 repeats of 10 apriori calls each, on the sparse one-hot frame.
repeat("apriori(items, min_support=0.05)", globals=globals(), number=10, repeat=10)
Out[2]: | |
[0.5350212700000001, | |
0.5399577099999999, | |
0.5390251770000001, | |
0.5814595669999996, | |
0.5923116000000004, | |
0.5686850980000004, | |
0.5839204499999999, | |
0.4983124719999994, | |
0.5384785540000001, | |
0.5440615109999998] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Run 4, larger dataset, new implementation | |
Line # Mem usage Increment Line Contents | |
================================================ | |
55 95.7 MiB 95.7 MiB @profile | |
56 def apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0): | |
57 """Get frequent itemsets from a one-hot DataFrame | |
58 Parameters | |
59 ----------- | |
60 df : pandas DataFrame or pandas SparseDataFrame | |
61 pandas DataFrame the encoded format. | |
62 The allowed values are either 0/1 or True/False. | |
63 For example, | |
64 | |
65 ``` | |
66 Apple Bananas Beer Chicken Milk Rice | |
67 0 1 0 1 1 0 1 | |
68 1 1 0 1 0 0 1 | |
69 2 1 0 1 0 0 0 | |
70 3 1 1 0 0 0 0 | |
71 4 0 0 1 1 1 1 | |
72 5 0 0 1 0 1 1 | |
73 6 0 0 1 0 1 0 | |
74 7 1 1 0 0 0 0 | |
75 ``` | |
76 | |
77 min_support : float (default: 0.5) | |
78 A float between 0 and 1 for minumum support of the itemsets returned. | |
79 The support is computed as the fraction | |
80 transactions_where_item(s)_occur / total_transactions. | |
81 | |
82 use_colnames : bool (default: False) | |
83 If true, uses the DataFrames' column names in the returned DataFrame | |
84 instead of column indices. | |
85 | |
86 max_len : int (default: None) | |
87 Maximum length of the itemsets generated. If `None` (default) all | |
88 possible itemsets lengths (under the apriori condition) are evaluated. | |
89 | |
90 verbose : int (default: 0) | |
91 Shows the number of iterations if 1. | |
92 | |
93 Returns | |
94 ----------- | |
95 pandas DataFrame with columns ['support', 'itemsets'] of all itemsets | |
96 that are >= `min_support` and < than `max_len` | |
97 (if `max_len` is not None). | |
98 Each itemset in the 'itemsets' column is of type `frozenset`, | |
99 which is a Python built-in type that behaves similarly to | |
100 sets except that it is immutable | |
101 (For more info, see | |
102 https://docs.python.org/3.6/library/stdtypes.html#frozenset). | |
103 | |
104 Examples | |
105 ----------- | |
106 For usage examples, please see | |
107 http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/ | |
108 | |
109 """ | |
110 | |
111 118.2 MiB 2.9 MiB def _support(_x, _n_rows, _is_sparse): | |
112 """ | |
113 DRY local method to calculate support as the row-wise sum of values / number of rows | |
114 :param _x: matrix of bools or binary | |
115 :param _n_rows: numeric, number of rows | |
116 :param _is_sparse: bool True if _x is sparse | |
117 :return: np.array, shape = (n_rows, ) | |
118 """ | |
119 118.2 MiB 0.0 MiB out = (np.sum(_x, axis=0) / _n_rows) | |
120 118.2 MiB 0.0 MiB return np.array(out).reshape(-1) | |
121 | |
122 112.4 MiB 16.7 MiB idxs = np.where((df.values != 1) & (df.values != 0)) | |
123 112.4 MiB 0.0 MiB if len(idxs[0]) > 0: | |
124 val = df.values[idxs[0][0], idxs[1][0]] | |
125 s = ('The allowed values for a DataFrame' | |
126 ' are True, False, 0, 1. Found value %s' % (val)) | |
127 raise ValueError(s) | |
128 | |
129 112.4 MiB 0.0 MiB is_sparse = hasattr(df, "to_coo") | |
130 112.4 MiB 0.0 MiB if is_sparse: | |
131 if not isinstance(df.columns[0], str) and df.columns[0] != 0: | |
132 raise ValueError('Due to current limitations in Pandas, ' | |
133 'if the SparseDataFrame has integer column names,' | |
134 'names, please make sure they either start ' | |
135 'with `0` or cast them as string column names: ' | |
136 '`df.columns = [str(i) for i in df.columns`].') | |
137 | |
138 X = df.to_coo().tocsc() | |
139 support = _support(X, X.shape[0], is_sparse) | |
140 else: | |
141 112.4 MiB 0.0 MiB X = df.values | |
142 112.4 MiB 0.0 MiB support = _support(X, X.shape[0], is_sparse) | |
143 | |
144 112.4 MiB 0.0 MiB ary_col_idx = np.arange(X.shape[1]) | |
145 112.4 MiB 0.0 MiB support_dict = {1: support[support >= min_support]} | |
146 112.4 MiB 0.0 MiB itemset_dict = {1: ary_col_idx[support >= min_support].reshape(-1, 1)} | |
147 112.4 MiB 0.0 MiB max_itemset = 1 | |
148 112.4 MiB 0.0 MiB rows_count = float(X.shape[0]) | |
149 | |
150 117.7 MiB 0.0 MiB while max_itemset and max_itemset < (max_len or float('inf')): | |
151 117.7 MiB 0.0 MiB next_max_itemset = max_itemset + 1 | |
152 117.7 MiB 0.0 MiB combin = np.array(list(generate_new_combinations(itemset_dict[max_itemset]))) | |
153 | |
154 117.7 MiB 0.0 MiB if combin.size == 0: | |
155 break | |
156 | |
157 117.7 MiB 0.0 MiB if is_sparse: | |
158 all_ones = np.ones((int(rows_count), 1)) | |
159 _bools = X[:, combin[:, 0]] == all_ones | |
160 for n in range(1, combin.shape[1]): | |
161 _bools = _bools & (X[:, combin[:, n]] == all_ones) | |
162 else: | |
163 117.8 MiB 2.9 MiB _bools = np.all(X[:, combin], axis=2) | |
164 | |
165 117.8 MiB 0.0 MiB support = _support(np.array(_bools), rows_count, is_sparse) | |
166 117.8 MiB 0.0 MiB _mask = (support >= min_support).reshape(-1) | |
167 | |
168 117.8 MiB 0.0 MiB if any(_mask): | |
169 117.7 MiB 0.0 MiB itemset_dict[next_max_itemset] = np.array(combin[_mask]) | |
170 117.7 MiB 0.0 MiB support_dict[next_max_itemset] = np.array(support[_mask]) | |
171 117.7 MiB 0.0 MiB max_itemset = next_max_itemset | |
172 else: | |
173 117.8 MiB 0.0 MiB break | |
174 117.8 MiB 0.0 MiB all_res = [] | |
175 117.9 MiB 0.0 MiB for k in sorted(itemset_dict): | |
176 117.9 MiB 0.0 MiB support = pd.Series(support_dict[k]) | |
177 117.9 MiB 0.0 MiB itemsets = pd.Series([frozenset(i) for i in itemset_dict[k]]) | |
178 | |
179 117.9 MiB 0.0 MiB res = pd.concat((support, itemsets), axis=1) | |
180 117.9 MiB 0.0 MiB all_res.append(res) | |
181 | |
182 117.9 MiB 0.0 MiB res_df = pd.concat(all_res) | |
183 117.9 MiB 0.0 MiB res_df.columns = ['support', 'itemsets'] | |
184 117.9 MiB 0.0 MiB if use_colnames: | |
185 mapping = {idx: item for idx, item in enumerate(df.columns)} | |
186 res_df['itemsets'] = res_df['itemsets'].apply(lambda x: frozenset([ | |
187 mapping[i] for i in x])) | |
188 117.9 MiB 0.0 MiB res_df = res_df.reset_index(drop=True) | |
189 | |
190 117.9 MiB 0.0 MiB if verbose: | |
191 print() # adds newline if verbose counter was used | |
192 | |
193 117.9 MiB 0.0 MiB return res_df |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment