Created
February 15, 2018 15:43
-
-
Save AbhishekAshokDubey/054af6f92d67d5ef8300fac58f59fcc9 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
Created on Fri May 05 04:12:36 2017 | |
@author: ADubey4 | |
""" | |
from __future__ import unicode_literals, print_function | |
import gensim | |
from gensim.parsing import PorterStemmer | |
from spacy.en import English | |
from gensim.models import Word2Vec, Phrases, phrases, KeyedVectors | |
from nltk.corpus import stopwords | |
from nltk import word_tokenize | |
from nltk import tokenize | |
import string | |
import re | |
import os | |
from itertools import chain | |
# nltk.download('wordnet') | |
# nltk.download('averaged_perceptron_tagger') | |
# nltk.download('stopwords') | |
# nltk.download('punkt') | |
# --- Global initialization ---------------------------------------------------
# sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Gensim's Porter stemmer; used by the Stemming helper class below.
stemmer = PorterStemmer()
# NOTE(review): this rebinds the imported `stopwords` module name to a plain
# list of English stop words, shadowing nltk.corpus.stopwords from here on.
stopwords = stopwords.words('english')
nlp = English()  # nlp = spacy.load("en")

# Directory holding the raw text corpus.  The working directory is switched
# to its parent so model files are saved next to (not inside) the data folder.
data_dir_path = r"/home/adubey4/word2vec/text_data"
base_dir = os.path.dirname(data_dir_path)
os.chdir(base_dir)

# Minimum corpus frequency for a token to enter the Word2Vec vocabulary.
min_count = 5
class Stemming(object):
    """Stem words while remembering which surface forms produced each stem.

    `word_lookup` maps stem -> {original word -> occurrence count}, so a
    stem can later be mapped back to its most frequently observed original
    form via `original_form`.
    """

    # Class-level registry shared by all callers: stem -> {word: count}.
    word_lookup = {}

    @classmethod
    def stem(cls, word):
        """Return the Porter stem of `word`, recording the original form."""
        root = stemmer.stem(word)
        counts = cls.word_lookup.setdefault(root, {})
        counts[word] = counts.get(word, 0) + 1
        return root

    @classmethod
    def original_form(cls, word):
        """Return the most frequent original form recorded for stem `word`.

        Falls back to `word` itself when the stem was never registered.
        """
        counts = cls.word_lookup.get(word)
        if counts is None:
            return word
        best, _count = max(counts.items(), key=lambda kv: kv[1])
        return best
#tokenize.sent_tokenize(doc.lower()) | |
class SentenceClass(object):
    """Re-iterable stream of tokenized sentences, one per qualifying line.

    Walks every file in `dirname`, lowercases each line, replaces every
    non-ASCII-letter character with a space, splits on whitespace, and
    yields the resulting word list.  Being re-iterable (not a one-shot
    generator) is required by gensim's Word2Vec, which scans the corpus
    multiple times (vocab pass + training epochs).
    """

    def __init__(self, dirname):
        # Directory whose files are treated as the training corpus.
        self.dirname = dirname

    def __iter__(self):
        for fname in os.listdir(self.dirname):
            # BUG FIX: the original `for line in open(...)` never closed the
            # file handles; `with` closes each file deterministically, even
            # if the consumer stops iterating early.
            with open(os.path.join(self.dirname, fname), 'r') as corpus_file:
                for line in corpus_file:
                    # Keep letters only, lowercase, split on whitespace.
                    word_list = re.sub("[^A-Za-z]", " ", line.lower()).split()
                    # Lines with fewer than 3 tokens carry too little
                    # context to be useful training sentences.
                    if len(word_list) > 2:
                        yield word_list
# Stream of tokenized sentences from the corpus directory (re-iterable).
my_sentences = SentenceClass(data_dir_path)
# check: the iterable can be scanned repeatedly, as gensim requires
# print(sum(1 for _ in my_sentences))
# print(sum(1 for _ in my_sentences))
# print(sum(1 for _ in my_sentences))

# Load Google's pretrained 300-dim News vectors (word->vector only, no model).
# NOTE(review): path is relative to base_dir set by os.chdir above — confirm
# the .bin file actually lives there.
google_wv = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

# Fresh model with the same dimensionality as the pretrained vectors.
# NOTE(review): size/iter and model.iter below are pre-4.0 gensim API
# (renamed vector_size/epochs in gensim 4.x) — this script needs gensim < 4.
model = Word2Vec(size=300, min_count=min_count, iter=10)
model.build_vocab(my_sentences)
training_examples_count = model.corpus_count
# below line will make it 1, so saving it before: the update=True vocab pass
# over the single injected "sentence" resets model.corpus_count.
# Extend the vocabulary with every word known to the pretrained model ...
model.build_vocab([list(google_wv.vocab.keys())], update=True)
# ... then copy pretrained weights for the intersecting words; lockf=1.0
# leaves those vectors trainable during the following train() call
# (lockf=0.0 would freeze them).
model.intersect_word2vec_format("GoogleNews-vectors-negative300.bin",binary=True, lockf=1.0)
# Train on the real corpus, using the corpus size captured earlier.
model.train(my_sentences,total_examples=training_examples_count, epochs=model.iter)

model.save("word2vec_model2")  # full model: training can be resumed later
# model1 = Word2Vec.load("word2vec_model")
model.wv.save("word2vec_model_vectors2")  # vectors only: smaller, query-only
# word_vectors1 = KeyedVectors.load("word2vec_model_vectors")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi! I created my model with my own data and then intersected it with the pretrained w2v model. After running the code at lines 86-88, the vocabulary size of my model hasn't changed, while the w2v model has 3M words. Is this expected behavior? Can we consider this transfer learning?