Skip to content

Instantly share code, notes, and snippets.

@jonas-kell
Created September 30, 2023 15:55
Show Gist options
  • Save jonas-kell/7f95a9535d14881dfe1a6582386ea383 to your computer and use it in GitHub Desktop.
Save jonas-kell/7f95a9535d14881dfe1a6582386ea383 to your computer and use it in GitHub Desktop.
Code of the Mathezirkel "Clustering und Wort-Einbettungen"
"""
INSTALLATION REQUIREMENTS:
- Download the dataset from https://www.kaggle.com/datasets/leadbest/googlenewsvectorsnegative300 (you need to create a Kaggle account)
- Copy the code into a file named words.py
- Install Python: https://www.python.org/
- Install gensim (pip install gensim)
- Make a folder called bin inside the folder where words.py is saved and place GoogleNews-vectors-negative300.bin inside
"""
from gensim.models import KeyedVectors
# Path of the pre-trained GoogleNews Word2Vec vectors, relative to the
# directory the script is started from (see the installation notes).
model_path = "./bin/GoogleNews-vectors-negative300.bin"
# Load the vectors once at import time; every helper below reads this
# module-level `model`. binary=True because the file is the binary .bin dump.
model = KeyedVectors.load_word2vec_format(fname=model_path, binary=True)
def similar(word):
    """Print the words the model considers most similar to ``word``.

    Args:
        word: Query word, forwarded unchanged to ``model.similar_by_word``.

    Returns:
        None. The query and the raw result of ``model.similar_by_word`` are
        written to stdout on a single line.
    """
    # NOTE(review): a word missing from the model's vocabulary presumably
    # makes gensim raise KeyError, which is not caught here - confirm.
    print(f"Most similar to {word}: {model.similar_by_word(word)}")
def distance(word1, word2):
    """Print the model's distance between ``word1`` and ``word2``.

    Args:
        word1: First word of the pair.
        word2: Second word of the pair.

    Returns:
        None. Writes ``"<word1> <-> <word2>: <distance>"`` to stdout, where
        the distance is whatever ``model.distance`` returns (gensim documents
        it as the cosine distance, i.e. 1 - cosine similarity).
    """
    print(f"{word1} <-> {word2}: {model.distance(word1, word2)}")
def transfer_relation(start, minus, plus):
    """Print the model's answer to the word analogy ``start - minus + plus``.

    Args:
        start: Word the relation is applied to (used as a positive term).
        minus: Word whose meaning is removed (used as the negative term).
        plus: Word whose meaning is added (used as a positive term).

    Returns:
        None. Prints three lines: the equation with the top-ranked answer,
        the full list of the five candidates returned by
        ``model.most_similar_cosmul``, and an empty separator line.
    """
    answers = model.most_similar_cosmul(
        positive=[start, plus], negative=[minus], topn=5
    )
    # answers[0] is the best-ranked candidate; its first element is the word.
    print(f"{start} - {minus} + {plus} = {answers[0][0]}")
    print(answers)
    print("")
# You can check out what is most similar to a given word.
similar("car")

# You can check out the distance between words
# (cat and dog are more closely related than cat and car).
distance("cat", "car")
distance("cat", "dog")

# You can check out combinations of word-meaning directions.
# The trailing comment on each line is the answer noted by the author.
transfer_relation("ocean", "water", "sand")  # dunes
transfer_relation("dog", "bark", "miaow")  # chihuahua, but kitten actually really close
transfer_relation("Italy", "pizza", "sushi")  # Japan
transfer_relation("butcher", "meat", "bread")  # baker
transfer_relation("car", "road", "river")  # boat

# Capital -> country: "Germany - Berlin + <capital>".
# Each capital is queried exactly once, in the original order.
for capital in (
    "Paris",  # France
    "Tokyo",  # Japan
    "Beijing",  # China
    "Lisbon",  # Portugal
    "Sofia",  # Bulgaria
    "London",  # UK
):
    transfer_relation("Germany", "Berlin", capital)

# But sometimes it does not work really well.
# Sadly not ice: here "good" comes up, because "solid" is a synonym for good.
transfer_relation("water", "liquid", "solid")
# No idea what happens here, but apparently things that burn ...
transfer_relation("fire", "heat", "cold")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment