Skip to content

Instantly share code, notes, and snippets.

from gensim.models.doc2vec import TaggedDocument
EMBEDDING_DIM = 200  # dimensionality of user representation


class TaggedDocumentIterator(object):
    """Iterable that yields one ``TaggedDocument`` per row of a DataFrame.

    Every row of ``df`` must provide an ``all_orders`` column holding a
    whitespace-separated string (split into the document's words) and a
    ``user_id`` column (used as the document's single tag).
    """

    def __init__(self, df=None):
        """Remember the DataFrame to iterate over.

        ``df`` is optional so that code which builds the iterator without
        arguments and assigns ``.df`` afterwards keeps working.
        """
        self.df = df

    def __iter__(self):
        """Yield a ``TaggedDocument`` for each row of ``self.df``."""
        # itertuples() yields namedtuples, so both columns can be read as
        # attributes instead of converting every row to a dict twice.
        for row in self.df.itertuples():
            yield TaggedDocument(words=row.all_orders.split(),
                                 tags=[row.user_id])
# Sort in place so that, within each user, rows follow order_number and then
# add_to_cart_order; the joined strings below inherit this row order.
orders.sort_values(by=['user_id', 'order_number', 'add_to_cart_order'],
                   inplace=True)

# Build one space-separated string of product ids per user.  The method chain
# is wrapped in parentheses: without them the line starting with ``.apply``
# is a syntax error.
# NOTE(review): ' '.join requires product_id to hold strings already -
# confirm it is cast to str before this point.
orders_by_uid = (
    orders.groupby("user_id")
    .apply(lambda order: ' '.join(order['product_id'].tolist()))
)
orders_by_uid = pd.DataFrame(orders_by_uid,
                             columns=['all_orders'])

# Move user_id from the index into a regular column and store it as str.
orders_by_uid.reset_index(inplace=True)
orders_by_uid.user_id = orders_by_uid.user_id.astype(str)
from sklearn.preprocessing import MinMaxScaler
# Min-max scale the customer features, keeping the original row index and
# column labels on the scaled frame.
mm_scale = MinMaxScaler()
# feature_df is the dataframe with customer features
feature_df_scale = pd.DataFrame(
    data=mm_scale.fit_transform(feature_df),
    index=feature_df.index.values,
    columns=feature_df.columns,
)

# Embed the scaled feature matrix into two dimensions with t-SNE.
tsne_doc_features = TSNE(
    n_components=2,
    perplexity=30,
    n_iter=500,
    verbose=1,
)
tsne_features_doc = tsne_doc_features.fit_transform(feature_df_scale.values)
from sklearn.metrics import silhouette_samples, silhouette_score
# Print the cosine silhouette score of the department and aisle labels in
# each embedding space: the t-SNE results and the model's original vectors.
embedding_spaces = (
    ('t-SNE', tsne_results),
    ('original', model.wv.vectors),
)
for space_name, space in embedding_spaces:
    for entity in ('department', 'aisle'):
        s = silhouette_score(space, df_semantic_item[entity], metric="cosine")
        print(f"Score on {space_name} space for {entity}s is {s:.4}")
--------------
from sklearn.manifold import TSNE
from sklearn.metrics import pairwise_distances
# prepare inputs for t-SNE
word_vectors = model.wv
vocab = list(word_vectors.vocab.keys())
# Map every vocabulary item to its embedding vector.
item2vector_dict = {item: word_vectors[item] for item in vocab}
# The dict keys become columns, so transpose to get one row per item.
item_frame = pd.DataFrame(item2vector_dict)
X = item_frame.T.values
# perform t-SNE
def to_product_name(id, columns='product_name'):
    """Look up a product in ``products_csv`` by its id.

    Args:
        id: Value compared against ``products_csv.product_id``.
        columns: Column label (or list of labels) to read from the matching
            row; defaults to ``'product_name'``.

    Returns:
        The requested value(s) of the first row whose ``product_id`` equals
        ``id``.

    Raises:
        IndexError: If no row has that ``product_id``.
    """
    # NOTE: the parameter name ``id`` shadows the builtin; it is kept so that
    # callers passing it by keyword are not broken.
    matches = products_csv[products_csv.product_id == id]
    return matches[columns].values.tolist()[0]
def most_similar_readable(model, product_id, topn=10):
    """Return the products most similar to ``product_id`` as a DataFrame.

    The queried product comes first with similarity 1.0, followed by the
    ``topn`` results of ``model.wv.most_similar``.  The frame has the columns
    ``product``, ``product_id`` and ``similarity``.
    """
    neighbours = model.wv.most_similar(str(product_id), topn=topn)
    rows = []
    for item_id, similarity in [(product_id, 1.0)] + neighbours:
        # The queried id and the neighbour ids are both converted to int
        # before being used for the name lookup and stored in the frame.
        numeric_id = int(item_id)
        rows.append((to_product_name(numeric_id), numeric_id, similarity))
    return pd.DataFrame(rows, columns=['product', 'product_id', 'similarity'])
from gensim.models import Word2Vec
import multiprocessing as mp
WORD_DIM = 200  # dimensionality of the embedding space

# Train the product embeddings on product_corpus.
# Two cores are left free, but the worker count is clamped to at least one:
# on a machine with one or two cores ``mp.cpu_count() - 2`` alone would hand
# Word2Vec a worker count of zero or less.
# NOTE(review): ``size`` is the gensim 3.x keyword; gensim >= 4.0 names it
# ``vector_size`` - adjust if the installed version is upgraded.
model = Word2Vec(product_corpus,
                 window=5,
                 size=WORD_DIM,
                 workers=max(1, mp.cpu_count() - 2),
                 min_count=100)
# Join every row of orders_csv to the rows of order_products_csv whose index
# equals its order_id.
order_ds = orders_csv.merge(order_products_csv,
                            left_on='order_id',
                            right_index=True)
# Creating sequences based on transactions
# The whole chain is parenthesised: previously the column selection sat on a
# line of its own and was parsed as a separate statement (a list literal
# followed by ``.values``), so order_product_list received the sorted
# DataFrame instead of the list of [order_id, product_id] pairs.
order_product_list = (
    order_ds
    .sort_values(['user_id', 'order_id', 'add_to_cart_order'])
    [['order_id', 'product_id']]
    .values
    .tolist()
)
# Each entry of a corpus is one order represented by
products.csv
| product_id | product_name | aisle_id | department_id |
| 1 | Chocolate Sandwich Cookies | 61 | 19 |
| 2 | All-Seasons Salt | 104 | 13 |
| 3 | Robust Golden Oolong Tea | 94 | 7 |
...
departments.csv # coarse categorization
| department_id | department |
| 1 | frozen |
# Start from a copy of the ddpg module's default configuration and override
# the network sizes, discount factor and training schedule.
config = ddpg.DEFAULT_CONFIG.copy()
config.update({
    "actor_hiddens": [512, 512],
    "critic_hiddens": [512, 512],
    "gamma": 0.95,
    "timesteps_per_iteration": 1000,
    "target_network_update_freq": 5,
    "buffer_size": 10000,
})

# Build the DDPG trainer for the SimpleSupplyChain environment.
trainer = ddpg.DDPGTrainer(env=SimpleSupplyChain, config=config)
for i in range(n_iterations):