# List the 50 top-weighted features of the local surrogate model
# (te is the TextExplainer defined in the snippet below)
te.explain_weights(top=50)

import tensorflow_hub as hub
import tensorflow as tf
import tensorflow_text  # registers the ops needed by the multilingual USE model
import numpy as np

# Load the multilingual Universal Sentence Encoder and wrap it as a frozen Keras layer
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")
hub_layer = hub.KerasLayer(embed, input_shape=[], dtype=tf.string, trainable=False)

# Number of target classes
category_counts = len(df_train['label'].value_counts())

model = tf.keras.Sequential()
model.add(hub_layer)

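The snippet above stops right after adding the embedding layer (category_counts is computed but not yet used). A minimal sketch of how the classification head and training step might continue, assuming 'label' is integer-encoded and the input text lives in a 'description' column; the layer sizes, optimizer, and epoch count are illustrative assumptions, not the author's settings.

# Hypothetical continuation: dense classification head on top of the frozen encoder
model.add(tf.keras.layers.Dense(256, activation='relu'))
model.add(tf.keras.layers.Dense(category_counts, activation='softmax'))

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',  # assumes integer-encoded labels
              metrics=['accuracy'])

# 'description' as the text column is an assumption based on the other snippets
model.fit(df_train['description'].values, df_train['label'].values,
          validation_split=0.1, epochs=5, batch_size=32)
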
import eli5
from eli5.lime import TextExplainer

# Explain a single prediction of the logistic-regression pipeline with LIME
target_names = train["targetcat"].unique().tolist()
te = TextExplainer(random_state=42)
te.fit(valid.description.iloc[10], logreg.predict_proba)
te.show_prediction(target_names=target_names)

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# TF-IDF + multinomial logistic regression baseline
# (MAX_DF, MIN_DF and N_JOBS are configuration constants defined elsewhere)
logreg = Pipeline(steps=[("vectorizer", TfidfVectorizer(max_df=MAX_DF,
                                                        min_df=MIN_DF,
                                                        stop_words=russian_stopwords,
                                                        # keep only tokens containing at least one Cyrillic letter
                                                        token_pattern=u'(?ui)\\b\\w*[а-я]+\\w*\\b')),
                         ("log_reg", LogisticRegression(n_jobs=N_JOBS,
                                                        solver="saga",
                                                        multi_class="multinomial",
                                                        random_state=100500))],
                  verbose=True)

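A hedged usage sketch for the pipeline above: the train/valid frames and the 'description'/'targetcat' columns come from the neighbouring snippets, but the exact fit and evaluation calls are assumptions rather than the original code.

# Assumed usage: fit on the training texts and score the validation split
logreg.fit(train["description"], train["targetcat"])
valid_pred = logreg.predict(valid["description"])

from sklearn.metrics import f1_score
print(f1_score(valid["targetcat"], valid_pred, average="macro"))
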
import nltk
from nltk.corpus import stopwords

# Download and load the Russian stop-word list used by the TF-IDF vectorizer
nltk.download("stopwords")
russian_stopwords = stopwords.words("russian")

from sklearn.metrics import f1_score

# Macro-averaged F1 over the test predictions
f1_score(df_test['targetcat'], df_test['prediction'], average='macro')

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# For every test description, pick the category whose name embedding is closest
cos_sim = cosine_similarity(df_test['embed'].tolist(), df['cat_embeding'].tolist())
indexes = np.argmax(cos_sim, axis=1)
cats = df.loc[indexes]['cat_name']
df_test['prediction'] = cats.tolist()

import os
import numpy as np
import pandas as pd
import tensorflow_hub as hub
import tensorflow_text  # registers the ops needed by the multilingual USE model

embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")

df_test = pd.read_csv(os.path.join(INPUT_DIR, "test.csv"), header=0)
categories = df_test.targetcat.unique()

# Embed each category name and each test description with the same encoder;
# list() keeps one 512-d vector per DataFrame row
df = pd.DataFrame({'cat_name': categories})
df['cat_embeding'] = list(embed(df['cat_name']))
df['cat_embeding'] = df['cat_embeding'].apply(lambda x: np.asarray(x))
df_test['embed'] = list(embed(df_test['description']))
df_test['embed'] = df_test['embed'].apply(lambda x: np.asarray(x))

from sklearn.model_selection import train_test_split

# 75% train, then split the remaining 25% evenly into validation and test,
# stratifying on the label at both steps
X_train, X_rest, y_train, y_rest = train_test_split(sample.drop(columns_to_drop, axis=1),
                                                    sample[label_column],
                                                    test_size=0.25,
                                                    random_state=random_seed,
                                                    stratify=sample[label_column])
X_val, X_test, y_val, y_test = train_test_split(X_rest,
                                                y_rest,
                                                test_size=0.5,
                                                random_state=random_seed,
                                                stratify=y_rest)

import pandas as pd

features = ["title", "description"]
label_column = "deep_category_minus_one"
sample_size = 10000

# Take up to sample_size rows per class and drop classes with 50 or fewer examples
df = pd.read_parquet(input_data)
sample = df.sort_values([label_column], ascending=True)\
           .groupby(label_column).head(sample_size)
v = sample[label_column].value_counts()
sample = sample[sample[label_column].isin(v.index[v.gt(50)])]
classes = sample[label_column].unique()