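# imports used throughout these snippets
import numpy as np
import tensorflow as tf

# true_prob_dist and predicted_prob_dist are referenced below but not defined
# in this gist -- the values here are hypothetical placeholders so the two
# KL-divergence lines run as written
true_prob_dist = np.array([0.7, 0.2, 0.1])
predicted_prob_dist = np.array([0.5, 0.3, 0.2])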
# KL divergence of the predicted distribution from the true one -- positive when they differ
sum(true_prob_dist * np.log(true_prob_dist / predicted_prob_dist))

# the KL divergence of a distribution from itself is exactly zero
sum(true_prob_dist * np.log(true_prob_dist / true_prob_dist))
query_1 = "dog"
bing_search_results = [
    "Dog - Wikipedia",
    "Adopting a dog or puppy | RSPCA Australia",
    "dog | History, Domestication, Physical Traits, & Breeds",
    "New South Wales | Dogs & Puppies | Gumtree Australia Free",
    "dog - Wiktionary"
]
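# query_2 and google_search_results are referenced below but not included in
# this gist -- the query and titles here are hypothetical placeholders (five
# results, matching the second row of grades) so the remaining snippets run
query_2 = "cat breeds"
google_search_results = [
    "Cat - Wikipedia",
    "List of cat breeds - Wikipedia",
    "Cat Breeds | Types of Cats & Pictures",
    "Cats & Kittens | Gumtree Australia Free",
    "cat - Wiktionary"
]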
# one row of relevance grades per query, one grade per search result
relevance_grades = tf.constant([
    [3.0, 2.0, 2.0, 2.0, 1.0],
    [3.0, 3.0, 1.0, 1.0, 0.0]
])
combined_texts = [query_1, *bing_search_results, query_2, *google_search_results]
tokeniser = tf.keras.preprocessing.text.Tokenizer()
tokeniser.fit_on_texts(combined_texts)
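# once fitted, the tokeniser maps words to integer indices (more frequent
# words get lower indices); the exact numbers depend on the corpus, so the
# output noted here is only illustrative
print(tokeniser.texts_to_sequences(["dog wikipedia"]))  # e.g. [[1, 7]]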
# add one to account for the padding index (0)
vocab_size = max(tokeniser.index_word) + 1
print(vocab_size)
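# index 0 never appears in index_word -- Keras reserves it for padding,
# which is why the highest word index alone undercounts by one
print(0 in tokeniser.index_word)  # False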
for idx, word in tokeniser.index_word.items():
    print(f"index {idx} - {word}")
EMBEDDING_DIMS = 2

# randomly initialised embeddings: one row per vocabulary entry (including padding index 0)
embeddings = np.random.randn(vocab_size, EMBEDDING_DIMS).astype(np.float32)
print(embeddings)
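# to pull out the vector for a single word, index the embedding matrix with
# the word's tokeniser index -- a minimal sketch, assuming "dog" is in the
# fitted vocabulary (it is, since query_1 was part of combined_texts)
dog_index = tokeniser.word_index["dog"]
print(embeddings[dog_index])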
# map the query to word indices, then gather the corresponding embedding rows
query_1_embedding_index = tokeniser.texts_to_sequences([query_1])
query_1_embeddings = np.array([embeddings[x] for x in query_1_embedding_index])
print(query_1_embeddings)
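# query_1 is a single word, so the lookup yields shape (1, 1, EMBEDDING_DIMS):
# one query, one word, one embedding vector
print(query_1_embeddings.shape)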
# the same lookup for the second query, which may contain more than one word
query_2_embedding_indices = tokeniser.texts_to_sequences([query_2])
query_2_embeddings = np.array([embeddings[x] for x in query_2_embedding_indices])
print(query_2_embeddings)
# average the per-word vectors over the word axis into a single query embedding
query_2_embeddings_avg = tf.reduce_mean(query_2_embeddings, axis=1, keepdims=True).numpy()
print(query_2_embeddings_avg)
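# averaging over the word axis collapses a variable-length query into one
# fixed-size vector, so queries of different lengths live in the same
# embedding space; the same average with plain numpy, as a sanity check
manual_avg = query_2_embeddings.mean(axis=1, keepdims=True)
np.testing.assert_allclose(manual_avg, query_2_embeddings_avg, rtol=1e-6)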