# -*- coding: utf-8 -*-
"""
Created on Fri May 05 04:12:36 2017

@author: ADubey4
"""
from __future__ import unicode_literals, print_function

import gensim
from gensim.parsing import PorterStemmer
from spacy.en import English
from gensim.models import Word2Vec, Phrases, phrases, KeyedVectors
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk import tokenize
import string
import re
import os
from itertools import chain
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('stopwords')
# nltk.download('punkt')

# global initialization
# sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
stemmer = PorterStemmer()
stopwords = stopwords.words('english')
nlp = English()  # nlp = spacy.load("en")

data_dir_path = r"/home/adubey4/word2vec/text_data"
base_dir = os.path.dirname(data_dir_path)
os.chdir(base_dir)

min_count = 5
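
# Stemming wraps the Porter stemmer and, for each stem, counts how often
# every surface form produced it, so original_form() can map a stem back
# to its most frequent original word.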
class Stemming(object):
    word_lookup = {}

    @classmethod
    def stem(cls, word):
        stemmed = stemmer.stem(word)
        if stemmed not in cls.word_lookup:
            cls.word_lookup[stemmed] = {}
        cls.word_lookup[stemmed][word] = (
            cls.word_lookup[stemmed].get(word, 0) + 1)
        return stemmed

    @classmethod
    def original_form(cls, word):
        if word in cls.word_lookup:
            return max(cls.word_lookup[word].keys(),
                       key=lambda x: cls.word_lookup[word][x])
        else:
            return word
# tokenize.sent_tokenize(doc.lower())
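
# SentenceClass streams the corpus from disk one line at a time: each line is
# lowercased, reduced to alphabetic tokens, and yielded as a word list.
# Because it is a re-iterable object (not a one-shot generator), gensim can
# scan it once for build_vocab() and again for train().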
class SentenceClass(object):
    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        for fname in os.listdir(self.dirname):
            for line in open(os.path.join(self.dirname, fname), 'r'):
                word_list = re.sub("[^A-Za-z]", " ", line.lower()).split()
                if len(word_list) > 2:
                    yield word_list

    # with open(os.path.join(self.dirname, fname), 'r') as myfile:
    #     doc = myfile.read().replace('\n', ' ')
    # for sent in tokenize.sent_tokenize(doc.lower()):
    #     yield [word for word in word_tokenize(re.sub("[^A-Za-z]", " ", sent))
    #            if word not in stopwords]
my_sentences = SentenceClass(data_dir_path)

# check: each pass should report the same sentence count, confirming the
# corpus can be iterated more than once
# print(sum(1 for _ in my_sentences))
# print(sum(1 for _ in my_sentences))
# print(sum(1 for _ in my_sentences))
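
# Fine-tuning recipe (as I read the intent of this gist): load the pretrained
# GoogleNews vectors, build a vocabulary from the local corpus, merge in the
# pretrained vocabulary, copy the pretrained weights over the intersection
# with lockf=1.0 so they stay trainable, then continue training locally.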
google_wv = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin',
                                              binary=True)

model = Word2Vec(size=300, min_count=min_count, iter=10)
model.build_vocab(my_sentences)
# build_vocab(update=True) below resets corpus_count to 1 (the pretrained
# vocab is passed as a single "sentence"), so save the real count first
training_examples_count = model.corpus_count
model.build_vocab([list(google_wv.vocab.keys())], update=True)
model.intersect_word2vec_format("GoogleNews-vectors-negative300.bin",
                                binary=True, lockf=1.0)
model.train(my_sentences, total_examples=training_examples_count,
            epochs=model.iter)

model.save("word2vec_model2")
# model1 = Word2Vec.load("word2vec_model")
model.wv.save("word2vec_model_vectors2")
# word_vectors1 = KeyedVectors.load("word2vec_model_vectors")
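
# Example query against the fine-tuned vectors ('computer' is a hypothetical
# probe word; any token in the merged vocabulary works):
# print(model.wv.most_similar('computer', topn=5))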