Yaron Vazana (yaronv)
from colorama import Fore
from IPython.display import clear_output, display
from ipywidgets import Output

def chatbot():
    # simple REPL loop: collect user messages until the user types 'quit'
    # (the loop body is a minimal reconstruction; the original gist truncates after the while line)
    quit = False
    responses = []
    while not quit:
        msg = input(Fore.GREEN + 'you: ')
        if msg.strip().lower() == 'quit':
            quit = True
        else:
            responses.append(msg)
    return responses
%%time
# ^ Jupyter cell magic: times this training cell
import os
import multiprocessing
import gensim
from gensim.models.doc2vec import Doc2Vec
# make sure the optimized (Cython) training routines are compiled
assert gensim.models.doc2vec.FAST_VERSION > -1
print('Training the model...')
cores = multiprocessing.cpu_count()
texts = MyTexts()  # MyTexts yields TaggedDocument objects (defined in the original gist)
doc2vec_model = Doc2Vec(vector_size=300, workers=cores, min_count=1, window=3, negative=5)
doc2vec_model.build_vocab(texts)
doc2vec_model.train(texts, total_examples=doc2vec_model.corpus_count, epochs=20)
# the original snippet truncates after this check; saving the model is the likely intent
if not os.path.exists('models'):
    os.makedirs('models')
doc2vec_model.save('models/doc2vec.model')
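A quick sanity check after training is to infer a vector for a new document; a minimal sketch, assuming the model trained above (the sample tokens are made up):

# infer a 300-dimensional vector for an unseen, pre-tokenized document
new_doc = ['how', 'do', 'i', 'reset', 'my', 'password']
vector = doc2vec_model.infer_vector(new_doc)
print(vector.shape)  # (300,)
# nearest training documents to the inferred vector (gensim 3.x docvecs API)
print(doc2vec_model.docvecs.most_similar([vector], topn=3))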
import numpy as np

def get_mean_vector(word2vec_model, words):
    # average the word vectors of a document, skipping out-of-vocabulary words
    # (gensim 3.x API; in gensim 4 check against word2vec_model.key_to_index instead)
    words = [word for word in words if word in word2vec_model.vocab]
    if len(words) >= 1:
        return np.mean(word2vec_model[words], axis=0)
    else:
        return []

# corpus: an iterable of documents with a .words token list; model: the KeyedVectors loaded below
for doc in corpus:
    vec = get_mean_vector(model, doc.words)
    if len(vec) > 0:
        pass  # do something with the vector `vec`
import gensim

# set the correct path to the file on your machine (e.g. the fastText wiki.en.vec vectors in word2vec text format)
model = gensim.models.KeyedVectors.load_word2vec_format('data/wiki.en.vec', binary=False)
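Once loaded, the KeyedVectors object answers standard similarity queries; a small hedged example (the query words are illustrative):

# nearest neighbours of a word in the embedding space
print(model.most_similar('computer', topn=5))
# cosine similarity between two in-vocabulary words
print(model.similarity('computer', 'laptop'))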
from gensim.parsing.porter import PorterStemmer
from gensim.parsing.preprocessing import remove_stopwords

class MyCorpus(object):
    def __init__(self, train_data):
        self.train_data = train_data

    def __iter__(self):
        p = PorterStemmer()
        for i in range(len(self.train_data)):
            # the loop body is truncated in the original; a likely body: strip stopwords, stem, tokenize
            text = remove_stopwords(self.train_data[i].lower())
            yield p.stem_sentence(text).split()
import pandas as pd
from IPython.display import display

# pair every message with the one before it to form (text, response) training examples
train_data = pd.DataFrame(columns=['id', 'text', 'response', 'name'])
prev_msg = ''
for index, row in df.iterrows():  # df holds the parsed chat messages
    if prev_msg != '':
        tmp = pd.DataFrame({'text': [prev_msg], 'response': [row['message']], 'id': [row['id']], 'name': [row['name']]})
        train_data = pd.concat([train_data, tmp[['id', 'text', 'response', 'name']]], ignore_index=True)
    prev_msg = row['message']
display(train_data)
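For intuition, three chat messages produce two training pairs; a hypothetical mini-input for the loop above:

import pandas as pd

# with this df, the loop yields (text='hi', response='hello') and (text='hello', response='how are you?')
df = pd.DataFrame({'id': [1, 2, 3],
                   'name': ['alice', 'bob', 'alice'],
                   'message': ['hi', 'hello', 'how are you?']})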
import re

# part of the chat-parser class: match a WhatsApp-style export line "date, time - sender: message"
def ismessage(self, line):
    patterns = {
        "pattern1": r'(\d{1,2}/\d{1,2}/\d{2,4}),\s+(\d{2}:\d{2})\s*-\s*(\w*\s*\w*)\s*:\s*(.*)'
    }
    for key in patterns:
        r = re.search(patterns[key], line)
        if r is not None:
            date = r.group(1)
            # the original snippet truncates here; presumably the remaining groups follow
            time, name, message = r.group(2), r.group(3), r.group(4)
            return True, date, time, name, message
    return False, None, None, None, None
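To see what the pattern captures, a standalone check on a made-up WhatsApp-style line:

import re

sample = '12/31/18, 20:15 - John Doe: see you tomorrow'
pattern = r'(\d{1,2}/\d{1,2}/\d{2,4}),\s+(\d{2}:\d{2})\s*-\s*(\w*\s*\w*)\s*:\s*(.*)'
m = re.search(pattern, sample)
print(m.groups())  # ('12/31/18', '20:15', 'John Doe', 'see you tomorrow')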
import gc
import multiprocessing

class Doc2VecTrainer(object):
    def __init__(self, train_corpus):
        self.train_corpus = train_corpus

    def run(self):
        print('app started')
        cores = multiprocessing.cpu_count()
        print('num of cores is %s' % cores)
        gc.collect()
        # the original gist presumably continues with the Doc2Vec training steps shown above
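A minimal usage sketch, assuming any iterable of tagged documents (the corpus variable here is hypothetical):

corpus = list(MyTexts())  # hypothetical: any iterable of TaggedDocument objects works
trainer = Doc2VecTrainer(corpus)
trainer.run()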
corpus.py
stream documents one by one from the disk
class MyCorpus(object):
    def __iter__(self):
        for line in open('mycorpus.txt'):
            # assume there's one document per line, tokens separated by whitespace
            yield dictionary.doc2bow(line.lower().split())
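The snippet assumes a gensim dictionary already in scope; a hedged sketch of how it could be built, and of streaming the corpus:

from gensim import corpora

# build the token dictionary in one streaming pass over the same file
dictionary = corpora.Dictionary(line.lower().split() for line in open('mycorpus.txt'))

# documents are converted to bag-of-words one at a time, never all in memory
corpus = MyCorpus()
for bow in corpus:
    print(bow)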