@huyhoang17
Forked from AbhishekAshokDubey/word2vec.py
Created April 6, 2019 15:45
# -*- coding: utf-8 -*-
"""
Created on Fri May 05 04:12:36 2017
@author: ADubey4
"""
from __future__ import unicode_literals, print_function
import gensim
from gensim.parsing import PorterStemmer
from spacy.en import English  # note: in spaCy >= 2.x this import path is spacy.lang.en
from gensim.models import Word2Vec, Phrases, phrases, KeyedVectors
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk import tokenize
import string
import re
import os
from itertools import chain
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('stopwords')
# nltk.download('punkt')
# global initialization
#sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
stemmer = PorterStemmer()
stopwords = stopwords.words('english')
nlp = English() #nlp = spacy.load("en")
data_dir_path = r"/home/adubey4/word2vec/text_data"
base_dir = os.path.dirname(data_dir_path)
os.chdir(base_dir)
min_count = 5
class Stemming(object):
    word_lookup = {}

    @classmethod
    def stem(cls, word):
        stemmed = stemmer.stem(word)
        if stemmed not in cls.word_lookup:
            cls.word_lookup[stemmed] = {}
        cls.word_lookup[stemmed][word] = (
            cls.word_lookup[stemmed].get(word, 0) + 1)
        return stemmed

    @classmethod
    def original_form(cls, word):
        if word in cls.word_lookup:
            return max(cls.word_lookup[word].keys(),
                       key=lambda x: cls.word_lookup[word][x])
        else:
            return word
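# Illustrative usage of Stemming (a sketch, not part of the original script):
# stem() counts how often each surface form maps to a stem, and
# original_form() returns the most frequently seen surface form for that stem.
# Stemming.stem("running")        # -> "run"
# Stemming.stem("running")        # -> "run"
# Stemming.stem("runs")           # -> "run"
# Stemming.original_form("run")   # -> "running" (seen twice vs. once for "runs")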
#tokenize.sent_tokenize(doc.lower())
class SentenceClass(object):
    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        for fname in os.listdir(self.dirname):
            for line in open(os.path.join(self.dirname, fname), 'r'):
                word_list = re.sub("[^A-Za-z]", " ", line.lower()).split()
                if len(word_list) > 2:
                    yield word_list

# alternative body for __iter__: sentence-level tokenization with stopword removal
# with open(os.path.join(self.dirname, fname), 'r') as myfile:
#     doc = myfile.read().replace('\n', ' ')
#     for sent in tokenize.sent_tokenize(doc.lower()):
#         yield [word for word in word_tokenize(re.sub("[^A-Za-z]", " ", sent))
#                if word not in stopwords]
my_sentences = SentenceClass(data_dir_path)
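# What iterating my_sentences yields (added sketch): every file line with more
# than two alphabetic tokens becomes one "sentence", a list of lowercase words,
# e.g. "Word2vec was created in 2013.\n" -> ['word', 'vec', 'was', 'created', 'in'].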
# sanity check: the corpus can be iterated repeatedly because __iter__ reopens the files
# print(sum(1 for _ in my_sentences))
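# Fine-tuning on top of the pretrained GoogleNews vectors, step by step:
#   1. build the vocabulary from the local corpus,
#   2. extend that vocabulary with the pretrained one (update=True),
#   3. copy the pretrained weights in with lockf=1.0 so they remain trainable,
#   4. continue training on the local sentences for model.iter epochs.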
google_wv = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
model = Word2Vec(size=300, min_count=min_count, iter=10)
model.build_vocab(my_sentences)
training_examples_count = model.corpus_count
# build_vocab(update=True) below resets corpus_count, so keep a copy first
model.build_vocab([list(google_wv.vocab.keys())], update=True)
model.intersect_word2vec_format("GoogleNews-vectors-negative300.bin", binary=True, lockf=1.0)
model.train(my_sentences, total_examples=training_examples_count, epochs=model.iter)
model.save("word2vec_model2")
#model1 = Word2Vec.load("word2vec_model")
model.wv.save("word2vec_model_vectors2")
#word_vectors1 = KeyedVectors.load("word2vec_model_vectors")
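# Reload sketch (assumption, not in the original gist): the saved files can be
# loaded back and queried; the names match the save() calls above.
# model2 = Word2Vec.load("word2vec_model2")
# word_vectors2 = KeyedVectors.load("word2vec_model_vectors2")
# print(word_vectors2.most_similar("data", topn=5))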