Last active
October 14, 2017 08:22
-
-
Save int128/181c075ce966022ae1d129e450dd3051 to your computer and use it in GitHub Desktop.
Example of Doc2Vec and Google Cloud Natural Language API
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from gensim.models.doc2vec import Doc2Vec | |
from gensim.models.doc2vec import TaggedDocument | |
from google.cloud import language | |
def wakachigaki(text):
    """Split *text* into a list of token strings.

    A Natural Language API client is created, *text* is wrapped in a
    document, and the document is annotated; the ``text_content`` of each
    token in the annotation result is collected in order.

    Args:
        text: The text to tokenize.

    Returns:
        A list with one ``text_content`` value per annotated token.
    """
    # NOTE(review): a fresh client is built on every call and the annotate
    # call presumably hits the remote service -- confirm credentials and
    # quota before using this on a large corpus.
    api_client = language.Client()
    doc = api_client.document_from_text(text)
    result = doc.annotate_text()
    return [tok.text_content for tok in result.tokens]
# Tiny demo corpus: each sentence is tokenized with wakachigaki() and tagged
# with a unique document id so its vector can be looked up after training.
training_docs = [
    TaggedDocument(words=wakachigaki(sentence), tags=[tag])
    for tag, sentence in (
        ('d1', u"This is a pen."),
        ('d2', u"This is an apple."),
        ('d3', u"This is a pineapple."),
        ('d4', u"That is a pen."),
    )
]

# min_count=1 and dm=0 are passed straight to gensim; see the Doc2Vec
# documentation for their exact meaning (presumably: keep every word, and
# use the PV-DBOW training mode -- confirm against the installed version).
model = Doc2Vec(documents=training_docs, min_count=1, dm=0)

# Show the learned vector of the first document, then the nearest
# neighbours of every document in the corpus.
print(model.docvecs['d1'])
for _tag in ('d1', 'd2', 'd3', 'd4'):
    print(model.docvecs.most_similar(_tag))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment