@AbhishekAshokDubey
Created February 15, 2018 15:43
# -*- coding: utf-8 -*-
"""
Created on Fri May 05 04:12:36 2017
@author: ADubey4
"""
from __future__ import unicode_literals, print_function
import gensim
from gensim.parsing import PorterStemmer
from spacy.en import English
from gensim.models import Word2Vec, Phrases, phrases, KeyedVectors
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk import tokenize
import string
import re
import os
from itertools import chain
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('stopwords')
# nltk.download('punkt')
# global initialization
#sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
stemmer = PorterStemmer()
stopwords = stopwords.words('english')
nlp = English() #nlp = spacy.load("en")
data_dir_path = r"/home/adubey4/word2vec/text_data"
base_dir = os.path.dirname(data_dir_path)
os.chdir(base_dir)
min_count = 5
class Stemming(object):
    word_lookup = {}

    @classmethod
    def stem(cls, word):
        stemmed = stemmer.stem(word)
        if stemmed not in cls.word_lookup:
            cls.word_lookup[stemmed] = {}
        cls.word_lookup[stemmed][word] = (
            cls.word_lookup[stemmed].get(word, 0) + 1)
        return stemmed

    @classmethod
    def original_form(cls, word):
        if word in cls.word_lookup:
            return max(cls.word_lookup[word].keys(),
                       key=lambda x: cls.word_lookup[word][x])
        else:
            return word
#tokenize.sent_tokenize(doc.lower())
class SentenceClass(object):
    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        for fname in os.listdir(self.dirname):
            for line in open(os.path.join(self.dirname, fname), 'r'):
                word_list = re.sub("[^A-Za-z]", " ", line.lower()).split()
                if len(word_list) > 2:
                    yield word_list
        # with open(os.path.join(self.dirname, fname), 'r') as myfile:
        #     doc = myfile.read().replace('\n', ' ')
        #     for sent in tokenize.sent_tokenize(doc.lower()):
        #         yield [word for word in word_tokenize(re.sub("[^A-Za-z]", " ", sent))
        #                if word not in stopwords]
my_sentences = SentenceClass(data_dir_path)
#check
#print(sum(1 for _ in my_sentences))
#print(sum(1 for _ in my_sentences))
#print(sum(1 for _ in my_sentences))
google_wv = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
model = Word2Vec(size=300, min_count=min_count, iter=10)
model.build_vocab(my_sentences)
training_examples_count = model.corpus_count
# the next build_vocab call resets corpus_count to 1, so save the real count first
model.build_vocab([list(google_wv.vocab.keys())], update=True)
model.intersect_word2vec_format("GoogleNews-vectors-negative300.bin", binary=True, lockf=1.0)
model.train(my_sentences, total_examples=training_examples_count, epochs=model.iter)
model.save("word2vec_model2")
#model1 = Word2Vec.load("word2vec_model2")
model.wv.save("word2vec_model_vectors2")
#word_vectors1 = KeyedVectors.load("word2vec_model_vectors2")
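For a quick sanity check, the commented load calls above show how the saved artifacts can be reloaded later. A minimal usage sketch (gensim 3.x API assumed; "data" is just a hypothetical query word assumed to have survived the min_count filter):

# Sketch: reload the saved model / vectors and query them.
from gensim.models import Word2Vec, KeyedVectors

model1 = Word2Vec.load("word2vec_model2")
word_vectors1 = KeyedVectors.load("word2vec_model_vectors2")
print(word_vectors1.most_similar("data", topn=5))  # hypothetical query word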
@RitiP

RitiP commented Oct 13, 2018

Hi,

Does this code allow me to train the existing Google word2vec model with some new training data, such as the CUB dataset? I'm trying it out and just asking for clarification.

Regards,
Riti

@RitiP

RitiP commented Oct 13, 2018

import os
import gensim

def show_file_contents(filename):
    with open(filename, 'r') as f:
        for i, line in enumerate(f):
            # keep the part after the first "." (the class name) and drop the newline
            line = line.split(".")[1].strip()
            print(line)

def read_classes(filename):
    class_list = []
    with open(filename, 'r') as f:
        for i, line in enumerate(f):
            line = line.split(".")[1].strip()
            class_list.append(line)
    return class_list

if __name__ == "__main__":

    show_file_contents(os.path.abspath("classes.txt"))

    classes = read_classes(os.path.abspath("classes.txt"))

    google_wv = gensim.models.KeyedVectors.load_word2vec_format('./model/GoogleNews-vectors-negative300.bin', binary=True)

    model = gensim.models.Word2Vec(
        size=300,
        window=5,
        min_count=0,
        workers=4)
    model.build_vocab([classes])
    training_examples_count = model.corpus_count
    model.build_vocab([list(google_wv.vocab.keys())], update=True)
    model.intersect_word2vec_format("./model/GoogleNews-vectors-negative300.bin", binary=True, lockf=1.0)
    model.train([classes], total_examples=training_examples_count, epochs=model.iter)

Code for your reference.

@raman-r-4978

Hi RitiP,

How long does your new model take to train on the new words? (say, for a 1 MB text file with new vocabulary)

@mansiganatra

Are both the below lines necessary?

model.build_vocab([list(google_wv.vocab.keys())], update=True)
model.intersect_word2vec_format("./model/GoogleNews-vectors-negative300.bin",binary=True, lockf=1.0)

model.build_vocab([list(google_wv.vocab.keys())], update=True) will update the vocabulary to the base model's instead of the current one, and model.intersect_word2vec_format("./model/GoogleNews-vectors-negative300.bin", binary=True, lockf=1.0) will intersect it again with the base model's vocab. What is the purpose of having both?
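Roughly speaking, the first call only adds the pretrained words to the vocabulary (with freshly initialised vectors), while the second copies the pretrained vectors in for words already in the vocabulary. A minimal sketch to see the two effects separately, assuming google_wv, my_sentences and min_count are set up as in the gist above (gensim 3.x API):

# Sketch: inspect what each step changes.
import numpy as np
from gensim.models import Word2Vec

model = Word2Vec(size=300, min_count=min_count, iter=10)
model.build_vocab(my_sentences)
print(len(model.wv.vocab))          # corpus vocabulary only

# Step 1: build_vocab(update=True) only adds the pretrained words;
# their vectors are still randomly initialised at this point.
model.build_vocab([list(google_wv.vocab.keys())], update=True)
print(len(model.wv.vocab))          # corpus vocab + Google vocab

# Step 2: intersect_word2vec_format copies the pretrained vectors for
# words already in the vocabulary (lockf=1.0 keeps them trainable).
model.intersect_word2vec_format("GoogleNews-vectors-negative300.bin",
                                binary=True, lockf=1.0)
word = "computer"                   # any word present in both vocabularies
print(np.allclose(model.wv[word], google_wv[word]))  # True after the intersect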

@saharghannay

How can I fine-tune a gensim word2vec model?
I have a gensim model trained on wiki data, and I would like to fine-tune it on new in-domain data.
I tried with this code, but I had a problem with the intersect_word2vec_format function:

model = KeyedVectors.load(args.pretrained_model)
model_cbow = Word2Vec(size=300, min_count=1)
model_cbow.build_vocab(sentences)
total_examples = model_cbow.corpus_count
model_cbow.build_vocab([list(model.wv.vocab.keys())], update=True)
model_cbow.intersect_word2vec_format(args.pretrained_model, binary=True)
model_cbow=Word2Vec(sentences, size=args.vector_size,window=5, min_count=args.min_count, workers=8, iter=5)

Do you have an idea how I can fix this problem, please?
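One likely cause: intersect_word2vec_format reads a plain word2vec text/binary file, while the KeyedVectors.load(args.pretrained_model) call suggests a gensim-native save. A minimal sketch of a possible workaround, assuming the pretrained file loads as a KeyedVectors object and that sentences/args are defined as above (pretrained_w2v_format.bin is a hypothetical path); note that the last line of the snippet above also rebuilds the model from scratch, which would discard the intersected vectors:

# Sketch: re-export the pretrained vectors in word2vec binary format, then intersect.
from gensim.models import Word2Vec, KeyedVectors

pretrained = KeyedVectors.load(args.pretrained_model)                       # gensim-native save
pretrained.save_word2vec_format("pretrained_w2v_format.bin", binary=True)   # hypothetical filename

model_cbow = Word2Vec(size=300, min_count=1)
model_cbow.build_vocab(sentences)
total_examples = model_cbow.corpus_count
model_cbow.build_vocab([list(pretrained.vocab.keys())], update=True)
model_cbow.intersect_word2vec_format("pretrained_w2v_format.bin", binary=True, lockf=1.0)
model_cbow.train(sentences, total_examples=total_examples, epochs=model_cbow.iter)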

@cyberosa

Hi, I am trying to do something similar, so I am first trying to understand all the transformations you did. Why are you importing the English stopwords when the code that uses them is commented out? Don't you need to remove the stopwords before feeding your sentences to the model?

Many thanks for sharing
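For reference, the stopword filtering only appears in the commented-out block of SentenceClass.__iter__ in the gist. A minimal sketch of enabling it in the active code path (same iterator, just filtering tokens before yielding; stopwords is the same NLTK list loaded at the top of the gist):

# Sketch: SentenceClass with stopword removal in the active branch.
import os
import re
from nltk.corpus import stopwords as nltk_stopwords

stopwords = nltk_stopwords.words('english')

class SentenceClass(object):
    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        for fname in os.listdir(self.dirname):
            for line in open(os.path.join(self.dirname, fname), 'r'):
                word_list = [w for w in re.sub("[^A-Za-z]", " ", line.lower()).split()
                             if w not in stopwords]
                if len(word_list) > 2:
                    yield word_list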

@zuriatib

Hi! I created my model with my own data and then intersected it with the pretrained w2v model. After running the code at lines 86-88, the vocabulary size of my model hasn't changed, while the w2v model has 3M words. Is this expected behavior? Do we consider this transfer learning?
