# -*- coding: utf-8 -*-
"""
Created on Fri May 05 04:12:36 2017
@author: ADubey4
"""
from __future__ import unicode_literals, print_function

import gensim
from gensim.parsing import PorterStemmer
from spacy.en import English
from gensim.models import Word2Vec, Phrases, phrases, KeyedVectors
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk import tokenize
import string
import re
import os
from itertools import chain

# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('stopwords')
# nltk.download('punkt')

# global initialization
# sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
stemmer = PorterStemmer()
stopwords = stopwords.words('english')
nlp = English()  # nlp = spacy.load("en")

data_dir_path = r"/home/adubey4/word2vec/text_data"
base_dir = os.path.dirname(data_dir_path)
os.chdir(base_dir)
min_count = 5


class Stemming(object):
    word_lookup = {}

    @classmethod
    def stem(cls, word):
        stemmed = stemmer.stem(word)
        if stemmed not in cls.word_lookup:
            cls.word_lookup[stemmed] = {}
        cls.word_lookup[stemmed][word] = (
            cls.word_lookup[stemmed].get(word, 0) + 1)
        return stemmed

    @classmethod
    def original_form(cls, word):
        if word in cls.word_lookup:
            return max(cls.word_lookup[word].keys(),
                       key=lambda x: cls.word_lookup[word][x])
        else:
            return word
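# Quick illustration (a sketch, not part of the original pipeline): stem()
# counts how often each surface form maps onto a stem, and original_form()
# returns the most frequent surface form recorded for that stem, falling back
# to the input for unknown stems.
# _s = Stemming.stem("apples"); Stemming.stem("apples"); Stemming.stem("apple")
# Stemming.original_form(_s)        # -> "apples"
# Stemming.original_form("banana")  # -> "banana"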
# tokenize.sent_tokenize(doc.lower())
class SentenceClass(object):
    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        for fname in os.listdir(self.dirname):
            for line in open(os.path.join(self.dirname, fname), 'r'):
                word_list = re.sub("[^A-Za-z]", " ", line.lower()).split()
                if len(word_list) > 2:
                    yield word_list
        # with open(os.path.join(self.dirname, fname), 'r') as myfile:
        #     doc = myfile.read().replace('\n', ' ')
        #     for sent in tokenize.sent_tokenize(doc.lower()):
        #         yield [word for word in word_tokenize(re.sub("[^A-Za-z]", " ", sent))
        #                if word not in stopwords]


my_sentences = SentenceClass(data_dir_path)

# check
# print(sum(1 for _ in my_sentences))
# print(sum(1 for _ in my_sentences))
# print(sum(1 for _ in my_sentences))

google_wv = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

model = Word2Vec(size=300, min_count=min_count, iter=10)
model.build_vocab(my_sentences)
training_examples_count = model.corpus_count
# the next build_vocab call resets corpus_count to 1, so it is saved above
model.build_vocab([list(google_wv.vocab.keys())], update=True)
model.intersect_word2vec_format("GoogleNews-vectors-negative300.bin", binary=True, lockf=1.0)

model.train(my_sentences, total_examples=training_examples_count, epochs=model.iter)

model.save("word2vec_model2")
# model1 = Word2Vec.load("word2vec_model")
model.wv.save("word2vec_model_vectors2")
# word_vectors1 = KeyedVectors.load("word2vec_model_vectors")
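A minimal sketch of loading the saved files back and querying them (same gensim 3.x API as above; "computer" is only a placeholder token and must exist in the vocabulary):

from gensim.models import Word2Vec, KeyedVectors

loaded_model = Word2Vec.load("word2vec_model2")              # full model; can continue training
word_vectors = KeyedVectors.load("word2vec_model_vectors2")  # vectors only; lighter, read-only

if "computer" in word_vectors.vocab:
    print(word_vectors.most_similar("computer", topn=5))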
import os
import gensim


def show_file_contents(filename):
    with open(filename, 'r') as f:
        for i, line in enumerate(f):
            # no_line = []
            line = line.split(".")[1]
            # no_line = (x for x in line.split('_'))
            # for x in no_line:
            #     print(x)
            print(line)


def read_classes(filename):
    class_list = []
    with open(filename, 'r') as f:
        for i, line in enumerate(f):
            line = line.split(".")[1]
            class_list.append(line)
    return class_list


if __name__ == "__main__":
    show_file_contents(os.path.abspath("classes.txt"))
    classes = read_classes(os.path.abspath("classes.txt"))

    google_wv = gensim.models.KeyedVectors.load_word2vec_format('./model/GoogleNews-vectors-negative300.bin', binary=True)

    model = gensim.models.Word2Vec(
        size=300,
        window=5,
        min_count=0,
        workers=4)
    model.build_vocab([classes])
    training_examples_count = model.corpus_count
    model.build_vocab([list(google_wv.vocab.keys())], update=True)
    model.intersect_word2vec_format("./model/GoogleNews-vectors-negative300.bin", binary=True, lockf=1.0)
    model.train([classes], total_examples=training_examples_count, epochs=model.iter)
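    # Optional sanity check (sketch): nearest neighbours of the first class,
    # assuming that string ended up as a token in the trained vocabulary.
    example_class = classes[0]
    if example_class in model.wv.vocab:
        print(model.wv.most_similar(example_class, topn=5))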
Code for your reference.
Hi RitiP,
how long does your new model take to train on the new words? (say, for a 1 MB text file with new vocabulary)
Are both of the lines below necessary?
model.build_vocab([list(google_wv.vocab.keys())], update=True)
model.intersect_word2vec_format("./model/GoogleNews-vectors-negative300.bin",binary=True, lockf=1.0)
model.build_vocab([list(google_wv.vocab.keys())], update=True) will update the vocabulary to the base model's instead of the current one, and model.intersect_word2vec_format("./model/GoogleNews-vectors-negative300.bin", binary=True, lockf=1.0) will intersect it again with the base model's vocab. What is the purpose of having both?
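To make the question concrete, this is the kind of check I have in mind, placed right after the build_vocab(..., update=True) call (a rough sketch against the gensim 3.x API used in the post; "computer" is only a placeholder word assumed to be in both vocabularies):

import numpy as np

word = "computer"  # placeholder; assumed to be in both the corpus vocab and the Google News vocab
before = model.wv[word].copy()  # still randomly initialized here, before any training
model.intersect_word2vec_format("./model/GoogleNews-vectors-negative300.bin", binary=True, lockf=1.0)
after = model.wv[word]
print(np.allclose(before, after))  # False would mean intersect replaced the vector with the pretrained one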
How can I fine-tune a gensim word2vec model?
I have a gensim model trained on wiki data, and I would like to fine-tune it on new in-domain data.
I tried this code, but I had a problem with the intersect_word2vec_format function:
model = KeyedVectors.load(args.pretrained_model)
model_cbow = Word2Vec(size=300, min_count=1)
model_cbow.build_vocab(sentences)
total_examples = model_cbow.corpus_count
model_cbow.build_vocab([list(model.wv.vocab.keys())], update=True)
model_cbow.intersect_word2vec_format(args.pretrained_model, binary=True)
model_cbow=Word2Vec(sentences, size=args.vector_size,window=5, min_count=args.min_count, workers=8, iter=5)
Do you have an idea how I can fix this problem, please?
Hi, I am trying to do something similar, so first I am trying to understand all the transformations you did. Why do you import the English stopwords when the code that uses them is commented out? Don't you need to remove the stopwords before feeding your sentences to the model?
Many thanks for sharing
Hi! I created my model with my own data, and then intersected it with the pretrained w2v model. After running the code at lines 86-88, the vocabulary size of my model hasn't changed, even though the w2v model has 3M words. Is this expected behavior? Do we consider this as transfer learning?
Hi,
Does this code allow me to train the existing Google word2vec model with some new training data, such as the CUB dataset? I'm trying it out; just asking to get a clarification.
Regards,
Riti