Skip to content

Instantly share code, notes, and snippets.

@heronyang
Created February 17, 2020 22:59
Show Gist options
  • Save heronyang/3d481011132824c9bd736914b6a40481 to your computer and use it in GitHub Desktop.
Save heronyang/3d481011132824c9bd736914b6a40481 to your computer and use it in GitHub Desktop.
"""
Downloads a text corpus, trains a Word2Vec model, and prints word similarities.
"""
import urllib.request
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from gensim.models import Word2Vec
# Source: Project Gutenberg's Alice's Adventures in Wonderland.
# URL of the corpus text; get_corpus() downloads and decodes it as UTF-8.
CORPUS_TEXT_URL = 'https://www.gutenberg.org/files/11/11.txt'
def main():
    """Train a Word2Vec model on the corpus and print two word similarities.

    Runs setup(), builds the tokenized training data from the downloaded
    corpus, trains a 25-dimensional model with a context window of 5, and
    prints the similarity of 'alice' to 'king' and of 'alice' to 'tree'.
    """
    setup()
    # Gets training data from corpus.
    data = build_data(get_corpus())
    # Trains the word2vec model. gensim 4.0 renamed the dimensionality
    # keyword from `size` to `vector_size`; try the current name first and
    # fall back so the script runs on either major version.
    try:
        model = Word2Vec(data, min_count=1, vector_size=25, window=5, sg=1)
    except TypeError:
        # gensim < 4.0 rejects `vector_size` as an unexpected keyword.
        model = Word2Vec(data, min_count=1, size=25, window=5, sg=1)
    # Prints the similarity scores between 'alice' and two other words.
    # The lookup goes through `model.wv`: calling `similarity` directly on
    # the model is deprecated in gensim 3.x and removed in 4.0.
    word_vectors = model.wv
    print(word_vectors.similarity('alice', 'king'))
    print(word_vectors.similarity('alice', 'tree'))
def setup():
    """Download the NLTK 'punkt' resource via nltk.download()."""
    punkt_resource = 'punkt'
    nltk.download(punkt_resource)
def get_corpus():
    """Download the corpus and return it as one whitespace-normalized string.

    Returns:
        The content at CORPUS_TEXT_URL decoded as UTF-8, with every run of
        whitespace (newlines included) collapsed into a single space.
    """
    # The context manager closes the HTTP response even if read() or
    # decode() raises, so the connection is never leaked.
    with urllib.request.urlopen(CORPUS_TEXT_URL) as response:
        text = response.read().decode("utf-8")
    # split() with no argument splits on any whitespace, so newlines need
    # no separate replacement before re-joining with single spaces.
    return ' '.join(text.split())
def build_data(corpus):
    """Tokenize *corpus* into sentences of lowercased word tokens.

    Args:
        corpus: The corpus text as a single string.

    Returns:
        A list with one entry per sentence found by sent_tokenize(); each
        entry is the list of that sentence's word_tokenize() tokens,
        lowercased.
    """
    tokenized_sentences = []
    for sentence in sent_tokenize(corpus):
        tokens = word_tokenize(sentence)
        tokenized_sentences.append([token.lower() for token in tokens])
    return tokenized_sentences
# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment