Ihor Kozlov (varrek) - GitHub gist snippets
# Multilingual Universal Sentence Encoder as a frozen Keras embedding layer.
import tensorflow_hub as hub
import tensorflow as tf
import tensorflow_text  # noqa: F401 -- registers the ops the multilingual USE needs
import numpy as np

embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")
hub_layer = hub.KerasLayer(embed, input_shape=[],
                           dtype=tf.string, trainable=False)

# df_train is assumed to be a DataFrame with a 'label' column of category names.
category_counts = len(df_train['label'].value_counts())

model = tf.keras.Sequential()
model.add(hub_layer)
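# Hedged completion sketch (not from the gist): a plausible softmax head sized by
# category_counts, trained on df_train under the assumption that 'description'
# holds the text and 'label' holds integer class ids. Hyperparameters are
# illustrative only.
model.add(tf.keras.layers.Dense(category_counts, activation='softmax'))
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.fit(df_train['description'].values,
          df_train['label'].values,
          epochs=5,
          validation_split=0.1)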
# LIME-style explanation of a single prediction with eli5's TextExplainer.
import eli5
from eli5.lime import TextExplainer

# train/valid and the fitted `logreg` pipeline come from the other snippets here.
target_names = train["targetcat"].unique().tolist()
te = TextExplainer(random_state=42)
te.fit(valid.description.iloc[10], logreg.predict_proba)
te.show_prediction(target_names=target_names)
te.explain_weights(top=50)  # top-50 weights of the fitted local surrogate
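# Optional sanity check (standard eli5 API, not shown in the gist): te.metrics_
# reports how faithfully the local surrogate mimics logreg.predict_proba.
print(te.metrics_)  # e.g. {'mean_KL_divergence': ..., 'score': ...}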
# TF-IDF over Cyrillic tokens feeding a multinomial logistic regression.
# MAX_DF, MIN_DF and N_JOBS are assumed to be defined elsewhere in the notebook.
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

logreg = Pipeline(steps=[("vectorizer", TfidfVectorizer(max_df=MAX_DF,
                                                        min_df=MIN_DF,
                                                        stop_words=russian_stopwords,
                                                        token_pattern=u'(?ui)\\b\\w*[а-я]+\\w*\\b')),
                         ("log_reg", LogisticRegression(n_jobs=N_JOBS,
                                                        solver="saga",
                                                        multi_class="multinomial",
                                                        random_state=100500))],
                  verbose=True)
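# Usage sketch under assumptions: train/valid DataFrames with 'description' text
# and 'targetcat' labels, matching the columns used in the other snippets here.
logreg.fit(train['description'], train['targetcat'])
valid_pred = logreg.predict(valid['description'])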
# Download and load the Russian stopword list used by the TF-IDF vectorizer above.
import nltk
from nltk.corpus import stopwords

nltk.download("stopwords")
russian_stopwords = stopwords.words("russian")
# Macro-averaged F1 over the predicted categories.
from sklearn.metrics import f1_score

f1_score(df_test['targetcat'], df_test['prediction'], average='macro')
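# Hedged companion check: a per-class precision/recall/F1 breakdown, useful when
# the macro average hides weak categories. Column names follow the snippet above.
from sklearn.metrics import classification_report
print(classification_report(df_test['targetcat'], df_test['prediction']))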
# Zero-shot classification: assign each description the category whose name
# embedding is most cosine-similar.
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

cos_sim = cosine_similarity(df_test['embed'].tolist(), df['cat_embeding'].tolist())
indexes = np.argmax(cos_sim, axis=1)  # most similar category per description
cats = df.loc[indexes]['cat_name']
df_test['prediction'] = cats.tolist()
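# Hedged sanity check (not in the gist): zero-shot accuracy of the
# cosine-similarity assignment against the gold labels.
accuracy = (df_test['prediction'] == df_test['targetcat']).mean()
print(f"zero-shot accuracy: {accuracy:.3f}")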
# Embed category names and test descriptions with the multilingual Universal
# Sentence Encoder. INPUT_DIR is assumed to be defined elsewhere in the notebook.
import os
import numpy as np
import pandas as pd
import tensorflow_hub as hub
import tensorflow_text  # noqa: F401 -- registers the ops the multilingual USE needs

embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")

df_test = pd.read_csv(os.path.join(INPUT_DIR, "test.csv"), header=0)
categories = df_test.targetcat.unique()
df = pd.DataFrame({'cat_name': categories})
# Wrap the (n, 512) embedding matrix in list() so pandas stores one ndarray per row.
df['cat_embeding'] = list(np.asarray(embed(df['cat_name'].tolist())))
df_test['embed'] = list(np.asarray(embed(df_test['description'].tolist())))
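# Quick shape check (illustrative): the multilingual USE returns one 512-d vector
# per input string.
assert df_test['embed'].iloc[0].shape == (512,)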
# Stratified 75/12.5/12.5 train/validation/test split. columns_to_drop and
# random_seed are assumed to be defined elsewhere in the notebook.
from sklearn.model_selection import train_test_split

X_train, X_rest, y_train, y_rest = train_test_split(sample.drop(columns_to_drop, axis=1),
                                                    sample[label_column],
                                                    test_size=0.25,
                                                    random_state=random_seed,
                                                    stratify=sample[label_column])
X_val, X_test, y_val, y_test = train_test_split(X_rest,
                                                y_rest,
                                                test_size=0.5,
                                                random_state=random_seed,
                                                stratify=y_rest)
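# Hedged verification sketch: stratification should keep the label proportions
# roughly equal across the three splits.
for name, y in [("train", y_train), ("val", y_val), ("test", y_test)]:
    print(name, y.value_counts(normalize=True).head())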
# Cap each class at sample_size rows and drop classes with 50 or fewer examples.
# input_data is assumed to be defined elsewhere in the notebook.
import pandas as pd

features = ["title", "description"]
label_column = "deep_category_minus_one"
sample_size = 10000

df = pd.read_parquet(input_data)
sample = (df.sort_values([label_column], ascending=True)
            .groupby(label_column).head(sample_size))
v = sample[label_column].value_counts()
sample = sample[sample[label_column].isin(v.index[v.gt(50)])]
classes = sample[label_column].unique()
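# Illustrative follow-up: inspect the class balance that survives the >50-example
# filter before splitting.
print(len(classes), "classes kept")
print(sample[label_column].value_counts().describe())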