# -*- coding: utf-8 -*-
"""
Created on Fri May 05 04:12:36 2017

@author: ADubey4
"""
from __future__ import unicode_literals, print_function

import gensim
from gensim.parsing import PorterStemmer
from spacy.en import English
from gensim.models import Word2Vec, Phrases, phrases, KeyedVectors
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk import tokenize
import string
import re
import os
from itertools import chain
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('stopwords')
# nltk.download('punkt')

# global initialization
# sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
stemmer = PorterStemmer()
stopwords = stopwords.words('english')
nlp = English()  # nlp = spacy.load("en")

data_dir_path = r"/home/adubey4/word2vec/text_data"
base_dir = os.path.dirname(data_dir_path)
os.chdir(base_dir)

min_count = 5
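
# Stemming wraps the Porter stemmer and, for each stem, counts how often
# every surface form produced it, so original_form() can map a stem back
# to its most frequent original word.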
class Stemming(object):
    word_lookup = {}

    @classmethod
    def stem(cls, word):
        stemmed = stemmer.stem(word)
        if stemmed not in cls.word_lookup:
            cls.word_lookup[stemmed] = {}
        cls.word_lookup[stemmed][word] = (
            cls.word_lookup[stemmed].get(word, 0) + 1)
        return stemmed

    @classmethod
    def original_form(cls, word):
        if word in cls.word_lookup:
            return max(cls.word_lookup[word].keys(),
                       key=lambda x: cls.word_lookup[word][x])
        else:
            return word
# tokenize.sent_tokenize(doc.lower())
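
# SentenceClass streams the corpus from disk one line at a time: each line is
# lowercased, reduced to alphabetic tokens, and yielded as a word list.
# Because it is a re-iterable object (not a one-shot generator), gensim can
# scan it once for build_vocab() and again for train().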
class SentenceClass(object):
    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        for fname in os.listdir(self.dirname):
            for line in open(os.path.join(self.dirname, fname), 'r'):
                word_list = re.sub("[^A-Za-z]", " ", line.lower()).split()
                if len(word_list) > 2:
                    yield word_list

    # with open(os.path.join(self.dirname, fname), 'r') as myfile:
    #     doc = myfile.read().replace('\n', ' ')
    # for sent in tokenize.sent_tokenize(doc.lower()):
    #     yield [word for word in word_tokenize(re.sub("[^A-Za-z]", " ", sent))
    #            if word not in stopwords]
my_sentences = SentenceClass(data_dir_path)

# check: each pass should report the same sentence count, confirming the
# corpus can be iterated more than once
# print(sum(1 for _ in my_sentences))
# print(sum(1 for _ in my_sentences))
# print(sum(1 for _ in my_sentences))
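
# Fine-tuning recipe (as I read the intent of this gist): load the pretrained
# GoogleNews vectors, build a vocabulary from the local corpus, merge in the
# pretrained vocabulary, copy the pretrained weights over the intersection
# with lockf=1.0 so they stay trainable, then continue training locally.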
google_wv = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin',
                                              binary=True)

model = Word2Vec(size=300, min_count=min_count, iter=10)
model.build_vocab(my_sentences)
# build_vocab(update=True) below resets corpus_count to 1 (the pretrained
# vocab is passed as a single "sentence"), so save the real count first
training_examples_count = model.corpus_count
model.build_vocab([list(google_wv.vocab.keys())], update=True)
model.intersect_word2vec_format("GoogleNews-vectors-negative300.bin",
                                binary=True, lockf=1.0)
model.train(my_sentences, total_examples=training_examples_count,
            epochs=model.iter)

model.save("word2vec_model2")
# model1 = Word2Vec.load("word2vec_model")
model.wv.save("word2vec_model_vectors2")
# word_vectors1 = KeyedVectors.load("word2vec_model_vectors")
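
# Example query against the fine-tuned vectors ('computer' is a hypothetical
# probe word; any token in the merged vocabulary works):
# print(model.wv.most_similar('computer', topn=5))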