Nikita Furin nokados

## tqdm_pandas.py
from tqdm import tqdm_notebook
# Create a notebook progress-bar object and call its `.pandas()` hook.
# NOTE(review): presumably this registers tqdm's `progress_apply` helpers on
# pandas objects — confirm against the installed tqdm version.
tqdm_notebook().pandas()

## clear_punctuation.py
import re
import string

# Characters removed by clear_punctuation(): ASCII punctuation plus the
# «»”“ quotes. '?' and '-' are taken out of the strip set, so they survive.
_STRIPPED_CHARS = re.sub(r'[\?-]', '', string.punctuation + '«»”“')
translator = str.maketrans('', '', _STRIPPED_CHARS)


def clear_punctuation(sentence):
    """Return `sentence` with punctuation deleted, keeping '?' and '-'.

    Args:
        sentence: Input string.

    Returns:
        A copy of `sentence` without any character listed in the module-level
        `translator` deletion table.
    """
    return sentence.translate(translator)

## doc2vec.py
def calc_embedding(text):
    """Sum the `new_model` vectors of the tokens of `text`.

    Tokens found in `stopwords_list` and tokens absent from `new_model` are
    ignored; `num_tokens` counts the tokens that contributed to the sum.

    NOTE(review): this excerpt ends right after the loop — no return statement
    is visible, so as shown the function returns None. Presumably the full
    source goes on to combine `vec` and `num_tokens`; confirm before relying
    on it.
    """
    tokens = word_tokenize(text)
    # Accumulator; assumes `new_model` vectors have 100 components — TODO confirm.
    vec = np.zeros(100)
    num_tokens = 0
    for token in tokens:
        if token in stopwords_list:
            continue
        if token in new_model:
            vec += new_model[token]
            num_tokens += 1

## wordcloud.py
%%time
# Fit `dbscan` on the document vectors (the `%%time` cell magic above times the cell).
# NOTE(review): `dbscan` is presumably a scikit-learn DBSCAN estimator — confirm.
clusters = dbscan.fit(doc2vec_list)

# Labels of the fitted object, converted to a plain Python list.
cl_labels = clusters.labels_.tolist()

def wordcloud_cluster_byIds(cluId):
    texts = []
    for i in range(0, len(cl_labels)):
        if cl_labels[i] == cluId:
            for word in word_tokenize(dialogs_concatted.iloc[i].TEXT):

## embedding.py
import numpy as np
from gensim.models import KeyedVectors, Word2Vec
from gensim.models.fasttext import FastText as FT_gensim
from nltk.tokenize import sent_tokenize, word_tokenize
import json
import pandas as pd
from tqdm import trange

# Path to a pretrained word-vector binary.
# NOTE(review): the filename suggests the 300-dimensional GoogleNews word2vec
# vectors; how the file is loaded is not visible here.
W2V_PATH = 'data/GoogleNews-vectors-negative300.bin'

## keras_scores_class.py
import keras.backend as K


def recall(y_true, y_pred):
    """Recall metric built from Keras backend ops.

    The numerator is the sum of the rounded, [0, 1]-clipped product
    `y_true * y_pred`; the denominator is the sum of the rounded, clipped
    `y_true` plus `K.epsilon()`, which keeps the division defined when
    `y_true` has no positives.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    return hits / (positives + K.epsilon())

def precision(y_true, y_pred):

## keras_word2vec_embedding.py
def word2vec_embedding_layer(embeddings_path='data/weights.npz', max_review_length=150):
    """Create an `Embedding` layer initialised from a stored weight matrix.

    Args:
        embeddings_path: Path handed to `load_weights`.
        max_review_length: Value passed as the layer's `input_length`.

    Returns:
        An `Embedding` whose `input_dim` / `output_dim` are the first and
        second dimensions of the loaded matrix and whose `weights` are that
        matrix.
    """
    matrix = load_weights(embeddings_path)
    n_rows = matrix.shape[0]
    n_cols = matrix.shape[1]
    return Embedding(
        input_dim=n_rows,
        output_dim=n_cols,
        input_length=max_review_length,
        weights=[matrix],
    )

# How to make 'data/weights.npz'? What is load_weights()?
# See https://gist.github.com/nokados/d5cfec00bc194822f89dff556ff62b29

## init_jupyter.py
%load_ext autoreload
%autoreload 2

import pandas as pd
from tqdm import tqdm_notebook, tqdm_pandas, tnrange
import time
import numpy as np
from IPython.display import clear_output
import pickle as pkl
import os

## has.py
def has(expr):
    """Build a predicate that tests whether a string contains a match for `expr`.

    Matching is case-insensitive, `^`/`$` match at line boundaries and `.`
    matches newlines (IGNORECASE | MULTILINE | DOTALL).

    Args:
        expr: Regular-expression pattern string.

    Returns:
        A one-argument callable returning True when the pattern is found
        anywhere in its argument, False otherwise.

    Raises:
        re.error: If `expr` is not a valid pattern (raised when the predicate
            is built).
    """
    # Local import: no file-level `import re` is visible in this listing.
    import re

    # Compile once so the predicate does not re-resolve the pattern per call.
    pattern = re.compile(expr, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
    return lambda x: bool(pattern.search(x))

## translit.py
# name: the string to transliterate
def transliterate(name):
   """
   Author: LarsKort
   Date: 16/07/2011; 1:05 GMT-4;
   I make no claims about the "quality" of this dictionary. It is good enough
   for my case, and you can always add your own characters and even words.
   Just be sure to add them to both lists, otherwise there will be an error.

   NOTE(review): the replacement tables and the transliteration logic are not
   visible in this excerpt; only the docstring is shown.
   """
   # Dictionary of replacements
	import re
	import string

	# Deletion table: ASCII punctuation plus the «»”“ quotes, with '?' and '-'
	# taken out of the strip set so that they are kept in the text.
	translator = str.maketrans('', '', re.sub(r'[\?-]', '', string.punctuation + '«»”“'))


	def clear_punctuation(sentence):
		"""Return `sentence` with punctuation deleted, keeping '?' and '-'.

		Args:
			sentence: Input string.

		Returns:
			A copy of `sentence` without the characters listed in `translator`.
		"""
		return sentence.translate(translator)
	def calc_embedding(text):
		"""Sum the `new_model` vectors of the tokens of `text`.

		Tokens found in `stopwords_list` and tokens absent from `new_model`
		are ignored; `num_tokens` counts the tokens that contributed.

		NOTE(review): this excerpt ends right after the loop — no return
		statement is visible, so as shown the function returns None. Confirm
		the remainder against the full source.
		"""
		tokens = word_tokenize(text)
		# Accumulator; assumes `new_model` vectors have 100 components — TODO confirm.
		vec = np.zeros(100)
		num_tokens = 0
		for token in tokens:
			if token in stopwords_list:
				continue
			if token in new_model:
				vec += new_model[token]
				num_tokens += 1
	%%time
	# Fit `dbscan` on the document vectors.
	# NOTE(review): `dbscan` is presumably a scikit-learn DBSCAN estimator — confirm.
	clusters = dbscan.fit(doc2vec_list)

	# Labels of the fitted object, converted to a plain Python list.
	cl_labels = clusters.labels_.tolist()

	def wordcloud_cluster_byIds(cluId):
	texts = []
	for i in range(0, len(cl_labels)):
	if cl_labels[i] == cluId:
	for word in word_tokenize(dialogs_concatted.iloc[i].TEXT):
	import numpy as np
	from gensim.models import KeyedVectors, Word2Vec
	from gensim.models.fasttext import FastText as FT_gensim
	from nltk.tokenize import sent_tokenize, word_tokenize
	import json
	import pandas as pd
	from tqdm import trange

	# Path to a pretrained word-vector binary.
	# NOTE(review): the filename suggests the 300-dimensional GoogleNews word2vec
	# vectors; how the file is loaded is not visible here.
	W2V_PATH = 'data/GoogleNews-vectors-negative300.bin'
	import keras.backend as K


	def recall(y_true, y_pred):
		"""Recall metric built from Keras backend ops.

		The numerator is the sum of the rounded, [0, 1]-clipped product
		`y_true * y_pred`; the denominator is the sum of the rounded, clipped
		`y_true` plus `K.epsilon()`, which keeps the division defined when
		`y_true` has no positives.
		"""
		true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
		possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
		# Return the ratio directly rather than binding a local named `recall`,
		# which would shadow this function's own name.
		return true_positives / (possible_positives + K.epsilon())

	def precision(y_true, y_pred):
	def word2vec_embedding_layer(embeddings_path='data/weights.npz', max_review_length=150):
		"""Create an `Embedding` layer initialised from a stored weight matrix.

		Args:
			embeddings_path: Path handed to `load_weights`.
			max_review_length: Value passed as the layer's `input_length`.

		Returns:
			An `Embedding` whose `input_dim` / `output_dim` are the first and
			second dimensions of the loaded matrix and whose `weights` are
			that matrix.
		"""
		weights = load_weights(embeddings_path)
		layer = Embedding(
			input_dim=weights.shape[0],
			output_dim=weights.shape[1],
			input_length=max_review_length,
			weights=[weights],
		)
		return layer

	# How to make 'data/weights.npz'? What is load_weights()?
	# See https://gist.github.com/nokados/d5cfec00bc194822f89dff556ff62b29
	%load_ext autoreload
	%autoreload 2

	import pandas as pd
	from tqdm import tqdm_notebook, tqdm_pandas, tnrange
	import time
	import numpy as np
	from IPython.display import clear_output
	import pickle as pkl
	import os
	def has(expr):
		"""Build a predicate that tests whether a string contains a match for `expr`.

		Matching is case-insensitive, `^`/`$` match at line boundaries and `.`
		matches newlines (IGNORECASE | MULTILINE | DOTALL).

		Args:
			expr: Regular-expression pattern string.

		Returns:
			A one-argument callable returning True when the pattern is found
			anywhere in its argument, False otherwise.

		Raises:
			re.error: If `expr` is not a valid pattern (raised when the
				predicate is built).
		"""
		# Local import: no file-level `import re` is visible in this listing.
		import re

		# Flags are combined with plain `|`; the escaped `\|` form is not valid Python.
		pattern = re.compile(expr, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
		return lambda x: bool(pattern.search(x))
	# name: the string to transliterate
	def transliterate(name):
		"""
		Author: LarsKort
		Date: 16/07/2011; 1:05 GMT-4;
		I make no claims about the "quality" of this dictionary. It is good
		enough for my case, and you can always add your own characters and
		even words. Just be sure to add them to both lists, otherwise there
		will be an error.

		NOTE(review): the replacement tables and the transliteration logic are
		not visible in this excerpt; only the docstring is shown.
		"""
		# Dictionary of replacements