Yaron Vazana (yaronv)
from colorama import Fore
from IPython.display import clear_output, display
from ipywidgets import Output

def chatbot():
    # simple REPL loop: collect user messages until the user types 'quit'
    # (the loop body is a minimal reconstruction; the original gist truncates after the while line)
    quit = False
    responses = []
    while not quit:
        msg = input(Fore.GREEN + 'you: ')
        if msg.strip().lower() == 'quit':
            quit = True
        else:
            responses.append(msg)
    return responses
%%time
# ^ Jupyter cell magic: times this training cell
import os
import multiprocessing
import gensim
from gensim.models.doc2vec import Doc2Vec
# make sure the optimized (Cython) training routines are compiled
assert gensim.models.doc2vec.FAST_VERSION > -1
print('Training the model...')
cores = multiprocessing.cpu_count()
texts = MyTexts()  # MyTexts yields TaggedDocument objects (defined in the original gist)
doc2vec_model = Doc2Vec(vector_size=300, workers=cores, min_count=1, window=3, negative=5)
doc2vec_model.build_vocab(texts)
doc2vec_model.train(texts, total_examples=doc2vec_model.corpus_count, epochs=20)
# the original snippet truncates after this check; saving the model is the likely intent
if not os.path.exists('models'):
    os.makedirs('models')
doc2vec_model.save('models/doc2vec.model')
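A quick sanity check after training is to infer a vector for a new document; a minimal sketch, assuming the model trained above (the sample tokens are made up):

# infer a 300-dimensional vector for an unseen, pre-tokenized document
new_doc = ['how', 'do', 'i', 'reset', 'my', 'password']
vector = doc2vec_model.infer_vector(new_doc)
print(vector.shape)  # (300,)
# nearest training documents to the inferred vector (gensim 3.x docvecs API)
print(doc2vec_model.docvecs.most_similar([vector], topn=3))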
import numpy as np

def get_mean_vector(word2vec_model, words):
    # average the word vectors of a document, skipping out-of-vocabulary words
    # (gensim 3.x API; in gensim 4 check against word2vec_model.key_to_index instead)
    words = [word for word in words if word in word2vec_model.vocab]
    if len(words) >= 1:
        return np.mean(word2vec_model[words], axis=0)
    else:
        return []

# corpus: an iterable of documents with a .words token list; model: the KeyedVectors loaded below
for doc in corpus:
    vec = get_mean_vector(model, doc.words)
    if len(vec) > 0:
        pass  # do something with the vector `vec`
import gensim

# set the correct path to the file on your machine (e.g. the fastText wiki.en.vec vectors in word2vec text format)
model = gensim.models.KeyedVectors.load_word2vec_format('data/wiki.en.vec', binary=False)
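Once loaded, the KeyedVectors object answers standard similarity queries; a small hedged example (the query words are illustrative):

# nearest neighbours of a word in the embedding space
print(model.most_similar('computer', topn=5))
# cosine similarity between two in-vocabulary words
print(model.similarity('computer', 'laptop'))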
from gensim.parsing.porter import PorterStemmer
from gensim.parsing.preprocessing import remove_stopwords

class MyCorpus(object):
    def __init__(self, train_data):
        self.train_data = train_data

    def __iter__(self):
        p = PorterStemmer()
        for i in range(len(self.train_data)):
            # the loop body is truncated in the original; a likely body: strip stopwords, stem, tokenize
            text = remove_stopwords(self.train_data[i].lower())
            yield p.stem_sentence(text).split()
import pandas as pd
from IPython.display import display

# pair every message with the one before it to form (text, response) training examples
train_data = pd.DataFrame(columns=['id', 'text', 'response', 'name'])
prev_msg = ''
for index, row in df.iterrows():  # df holds the parsed chat messages
    if prev_msg != '':
        tmp = pd.DataFrame({'text': [prev_msg], 'response': [row['message']], 'id': [row['id']], 'name': [row['name']]})
        train_data = pd.concat([train_data, tmp[['id', 'text', 'response', 'name']]], ignore_index=True)
    prev_msg = row['message']
display(train_data)
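For intuition, three chat messages produce two training pairs; a hypothetical mini-input for the loop above:

import pandas as pd

# with this df, the loop yields (text='hi', response='hello') and (text='hello', response='how are you?')
df = pd.DataFrame({'id': [1, 2, 3],
                   'name': ['alice', 'bob', 'alice'],
                   'message': ['hi', 'hello', 'how are you?']})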
import re

# part of the chat-parser class: match a WhatsApp-style export line "date, time - sender: message"
def ismessage(self, line):
    patterns = {
        "pattern1": r'(\d{1,2}/\d{1,2}/\d{2,4}),\s+(\d{2}:\d{2})\s*-\s*(\w*\s*\w*)\s*:\s*(.*)'
    }
    for key in patterns:
        r = re.search(patterns[key], line)
        if r is not None:
            date = r.group(1)
            # the original snippet truncates here; presumably the remaining groups follow
            time, name, message = r.group(2), r.group(3), r.group(4)
            return True, date, time, name, message
    return False, None, None, None, None
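To see what the pattern captures, a standalone check on a made-up WhatsApp-style line:

import re

sample = '12/31/18, 20:15 - John Doe: see you tomorrow'
pattern = r'(\d{1,2}/\d{1,2}/\d{2,4}),\s+(\d{2}:\d{2})\s*-\s*(\w*\s*\w*)\s*:\s*(.*)'
m = re.search(pattern, sample)
print(m.groups())  # ('12/31/18', '20:15', 'John Doe', 'see you tomorrow')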
import gc
import multiprocessing

class Doc2VecTrainer(object):
    def __init__(self, train_corpus):
        self.train_corpus = train_corpus

    def run(self):
        print('app started')
        cores = multiprocessing.cpu_count()
        print('num of cores is %s' % cores)
        gc.collect()
        # the original gist presumably continues with the Doc2Vec training steps shown above
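A minimal usage sketch, assuming any iterable of tagged documents (the corpus variable here is hypothetical):

corpus = list(MyTexts())  # hypothetical: any iterable of TaggedDocument objects works
trainer = Doc2VecTrainer(corpus)
trainer.run()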
corpus.py
stream documents one by one from the disk
class MyCorpus(object):
    def __iter__(self):
        for line in open('mycorpus.txt'):
            # assume there's one document per line, tokens separated by whitespace
            yield dictionary.doc2bow(line.lower().split())
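The snippet assumes a gensim dictionary already in scope; a hedged sketch of how it could be built, and of streaming the corpus:

from gensim import corpora

# build the token dictionary in one streaming pass over the same file
dictionary = corpora.Dictionary(line.lower().split() for line in open('mycorpus.txt'))

# documents are converted to bag-of-words one at a time, never all in memory
corpus = MyCorpus()
for bow in corpus:
    print(bow)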