# List the 50 top-weighted features of the local surrogate model
# (te is the TextExplainer defined in the snippet below)
te.explain_weights(top=50)

import tensorflow_hub as hub
import tensorflow as tf
import tensorflow_text  # registers the ops needed by the multilingual USE model
import numpy as np

# Load the multilingual Universal Sentence Encoder and wrap it as a frozen Keras layer
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")
hub_layer = hub.KerasLayer(embed, input_shape=[], dtype=tf.string, trainable=False)

# Number of target classes
category_counts = len(df_train['label'].value_counts())

model = tf.keras.Sequential()
model.add(hub_layer)

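The snippet above stops right after adding the embedding layer (category_counts is computed but not yet used). A minimal sketch of how the classification head and training step might continue, assuming 'label' is integer-encoded and the input text lives in a 'description' column; the layer sizes, optimizer, and epoch count are illustrative assumptions, not the author's settings.

# Hypothetical continuation: dense classification head on top of the frozen encoder
model.add(tf.keras.layers.Dense(256, activation='relu'))
model.add(tf.keras.layers.Dense(category_counts, activation='softmax'))

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',  # assumes integer-encoded labels
              metrics=['accuracy'])

# 'description' as the text column is an assumption based on the other snippets
model.fit(df_train['description'].values, df_train['label'].values,
          validation_split=0.1, epochs=5, batch_size=32)
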
import eli5
from eli5.lime import TextExplainer

# Explain a single prediction of the logistic-regression pipeline with LIME
target_names = train["targetcat"].unique().tolist()
te = TextExplainer(random_state=42)
te.fit(valid.description.iloc[10], logreg.predict_proba)
te.show_prediction(target_names=target_names)

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# TF-IDF + multinomial logistic regression baseline
# (MAX_DF, MIN_DF and N_JOBS are configuration constants defined elsewhere)
logreg = Pipeline(steps=[("vectorizer", TfidfVectorizer(max_df=MAX_DF,
                                                        min_df=MIN_DF,
                                                        stop_words=russian_stopwords,
                                                        # keep only tokens containing at least one Cyrillic letter
                                                        token_pattern=u'(?ui)\\b\\w*[а-я]+\\w*\\b')),
                         ("log_reg", LogisticRegression(n_jobs=N_JOBS,
                                                        solver="saga",
                                                        multi_class="multinomial",
                                                        random_state=100500))],
                  verbose=True)

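A hedged usage sketch for the pipeline above: the train/valid frames and the 'description'/'targetcat' columns come from the neighbouring snippets, but the exact fit and evaluation calls are assumptions rather than the original code.

# Assumed usage: fit on the training texts and score the validation split
logreg.fit(train["description"], train["targetcat"])
valid_pred = logreg.predict(valid["description"])

from sklearn.metrics import f1_score
print(f1_score(valid["targetcat"], valid_pred, average="macro"))
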
import nltk
from nltk.corpus import stopwords

# Download and load the Russian stop-word list used by the TF-IDF vectorizer
nltk.download("stopwords")
russian_stopwords = stopwords.words("russian")

from sklearn.metrics import f1_score

# Macro-averaged F1 over the test predictions
f1_score(df_test['targetcat'], df_test['prediction'], average='macro')

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# For every test description, pick the category whose name embedding is closest
cos_sim = cosine_similarity(df_test['embed'].tolist(), df['cat_embeding'].tolist())
indexes = np.argmax(cos_sim, axis=1)
cats = df.loc[indexes]['cat_name']
df_test['prediction'] = cats.tolist()

import os
import numpy as np
import pandas as pd
import tensorflow_hub as hub
import tensorflow_text  # registers the ops needed by the multilingual USE model

embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")

df_test = pd.read_csv(os.path.join(INPUT_DIR, "test.csv"), header=0)
categories = df_test.targetcat.unique()

# Embed each category name and each test description with the same encoder;
# list() keeps one 512-d vector per DataFrame row
df = pd.DataFrame({'cat_name': categories})
df['cat_embeding'] = list(embed(df['cat_name']))
df['cat_embeding'] = df['cat_embeding'].apply(lambda x: np.asarray(x))
df_test['embed'] = list(embed(df_test['description']))
df_test['embed'] = df_test['embed'].apply(lambda x: np.asarray(x))

from sklearn.model_selection import train_test_split

# 75% train, then split the remaining 25% evenly into validation and test,
# stratifying on the label at both steps
X_train, X_rest, y_train, y_rest = train_test_split(sample.drop(columns_to_drop, axis=1),
                                                    sample[label_column],
                                                    test_size=0.25,
                                                    random_state=random_seed,
                                                    stratify=sample[label_column])
X_val, X_test, y_val, y_test = train_test_split(X_rest,
                                                y_rest,
                                                test_size=0.5,
                                                random_state=random_seed,
                                                stratify=y_rest)

import pandas as pd

features = ["title", "description"]
label_column = "deep_category_minus_one"
sample_size = 10000

# Take up to sample_size rows per class and drop classes with 50 or fewer examples
df = pd.read_parquet(input_data)
sample = df.sort_values([label_column], ascending=True)\
           .groupby(label_column).head(sample_size)
v = sample[label_column].value_counts()
sample = sample[sample[label_column].isin(v.index[v.gt(50)])]
classes = sample[label_column].unique()