Skip to content

Instantly share code, notes, and snippets.

@IshitaTakeshi
Created September 8, 2017 15:47
Show Gist options
  • Save IshitaTakeshi/ea5eeca04851158a5db77e2f7e135321 to your computer and use it in GitHub Desktop.
Save IshitaTakeshi/ea5eeca04851158a5db77e2f7e135321 to your computer and use it in GitHub Desktop.
Innovation Project
import os
from xml.etree import ElementTree as ET

from bs4 import BeautifulSoup
from gensim.models.doc2vec import Doc2Vec, TaggedDocument, DocvecsArray
# root = tree.getroot()
# for neighbor in root.iter("neighbor"):
# print(neighbor)
# Minimum length, in characters, of a paragraph's text for it to be kept as
# a training document; generate_documents() skips anything shorter.
MIN_LINE_LENGTH = 80
def generate_documents(path="./fiwiki-20140809-corpus.xml", min_line_length=None):
    """Build the list of tagged documents used to train Doc2Vec.

    Every ``<p>`` element of the XML corpus whose text is at least
    ``min_line_length`` characters long becomes one ``TaggedDocument``.
    Its words are the whitespace-separated tokens of the paragraph and its
    single tag is the document's running index rendered as a string
    ("0", "1", ...).

    Args:
        path: Location of the XML corpus file.
        min_line_length: Paragraphs whose text is shorter than this many
            characters are skipped. ``None`` (the default) means "use the
            module-level ``MIN_LINE_LENGTH``", looked up at call time.

    Returns:
        A list of ``TaggedDocument`` objects in corpus order.
    """
    if min_line_length is None:
        min_line_length = MIN_LINE_LENGTH

    # Decode explicitly instead of relying on the platform's default locale
    # encoding. NOTE(review): assumes the corpus file is UTF-8 -- confirm.
    with open(path, "r", encoding="utf-8") as f:
        xmlstring = f.read()
    soup = BeautifulSoup(xmlstring, "xml")

    documents = []
    for paragraph in soup.find_all("p"):
        text = paragraph.text
        if len(text) < min_line_length:
            continue
        # str.split() with no argument splits on any run of whitespace
        # (newlines included) and never yields empty tokens, unlike
        # split(" "), which emits "" for consecutive or leading spaces.
        words = text.split()
        if not words:
            # Whitespace-only paragraph: nothing to learn from.
            continue
        # The tag is the index this document will occupy in the list.
        documents.append(TaggedDocument(words=words, tags=[str(len(documents))]))
    return documents
# File the trained Doc2Vec model is saved to and loaded from.
MODEL_PATH = "model"

# Reuse a saved model when one is present; otherwise train on the corpus and
# save the result so later runs can simply load it. Delete the file to force
# retraining.
if os.path.exists(MODEL_PATH):
    model = Doc2Vec.load(MODEL_PATH)
else:
    documents = generate_documents()
    # NOTE(review): `size` is the keyword used by the older gensim API this
    # file is written against; confirm before upgrading gensim.
    model = Doc2Vec(documents, size=100, window=8, min_count=5, workers=4)
    model.save(MODEL_PATH)

# Similarity between two token lists supplied directly (not looked up by tag
# among the training documents); the model itself is passed as first argument.
similarity = model.docvecs.similarity_unseen_docs(
    model,
    "kiitos kiitos moi".split(" "),
    "moi moi".split(" "),
)
print(similarity)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment