@ershovio
Created May 2, 2021 19:59
An example of a data science task
#%%
import string

import nltk
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import plot_precision_recall_curve, precision_recall_curve, precision_score, recall_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline

# Both NLTK resources are needed: 'punkt' for word_tokenize, 'stopwords' for stopwords.words("russian").
nltk.download('punkt')
nltk.download('stopwords')
#%%
df = pd.read_csv("./data/labeled.csv", sep=",")
#%%
df.shape
#%%
df.head(5)
#%%
df["toxic"] = df["toxic"].apply(int)
#%%
df.head(5)
#%%
df["toxic"].value_counts()
#%%
for c in df[df["toxic"] == 1]["comment"].head(5):
print(c)
#%%
for c in df[df["toxic"] == 0]["comment"].head(5):
print(c)
#%%
train_df, test_df = train_test_split(df, test_size=500)
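#%%
# Note (sketch): the split above is unstratified; if the classes are imbalanced,
# stratifying on the label keeps the class ratio equal in both parts, e.g.:
# train_df, test_df = train_test_split(df, test_size=500, stratify=df["toxic"], random_state=0)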
#%%
test_df.shape
#%%
test_df["toxic"].value_counts()
#%%
train_df["toxic"].value_counts()
#%%
sentence_example = df.iloc[1]["comment"]
tokens = word_tokenize(sentence_example, language="russian")
tokens_without_punctuation = [i for i in tokens if i not in string.punctuation]
russian_stop_words = stopwords.words("russian")
tokens_without_stop_words_and_punctuation = [i for i in tokens_without_punctuation if i not in russian_stop_words]
snowball = SnowballStemmer(language="russian")
stemmed_tokens = [snowball.stem(i) for i in tokens_without_stop_words_and_punctuation]
#%%
print(f"Исходный текст: {sentence_example}")
print("-----------------")
print(f"Токены: {tokens}")
print("-----------------")
print(f"Токены без пунктуации: {tokens_without_punctuation}")
print("-----------------")
print(f"Токены без пунктуации и стоп слов: {tokens_without_stop_words_and_punctuation}")
print("-----------------")
print(f"Токены после стемминга: {stemmed_tokens}")
print("-----------------")
#%%
snowball = SnowballStemmer(language="russian")
russian_stop_words = stopwords.words("russian")

def tokenize_sentence(sentence: str, remove_stop_words: bool = True):
    tokens = word_tokenize(sentence, language="russian")
    tokens = [i for i in tokens if i not in string.punctuation]
    if remove_stop_words:
        tokens = [i for i in tokens if i not in russian_stop_words]
    tokens = [snowball.stem(i) for i in tokens]
    return tokens
#%%
tokenize_sentence(sentence_example)
#%%
vectorizer = TfidfVectorizer(tokenizer=lambda x: tokenize_sentence(x, remove_stop_words=True))
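#%%
# Note (sketch): the lambda above works, but lambdas cannot be pickled, so the fitted
# vectorizer could not be saved with the standard pickle module. Passing the named
# function is equivalent here, since remove_stop_words defaults to True:
# vectorizer = TfidfVectorizer(tokenizer=tokenize_sentence)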
#%%
features = vectorizer.fit_transform(train_df["comment"])
#%%
model = LogisticRegression(random_state=0)
model.fit(features, train_df["toxic"])
#%%
model.predict(features[0])
#%%
train_df["comment"].iloc[0]
#%%
model_pipeline = Pipeline([
    ("vectorizer", TfidfVectorizer(tokenizer=lambda x: tokenize_sentence(x, remove_stop_words=True))),
    ("model", LogisticRegression(random_state=0)),
])
#%%
model_pipeline.fit(train_df["comment"], train_df["toxic"])
#%%
model_pipeline.predict(["Привет, у меня все нормально"])
#%%
model_pipeline.predict(["Слушай не пойти ли тебе нафиг отсюда?"])
#%%
precision_score(y_true=test_df["toxic"], y_pred=model_pipeline.predict(test_df["comment"]))
#%%
recall_score(y_true=test_df["toxic"], y_pred=model_pipeline.predict(test_df["comment"]))
#%%
prec, rec, thresholds = precision_recall_curve(y_true=test_df["toxic"], probas_pred=model_pipeline.predict_proba(test_df["comment"])[:, 1])
#%%
plot_precision_recall_curve(estimator=model_pipeline, X=test_df["comment"], y=test_df["toxic"])
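#%%
# Note: plot_precision_recall_curve was deprecated in scikit-learn 1.0 and removed in 1.2.
# On newer versions the equivalent call is (sketch):
# from sklearn.metrics import PrecisionRecallDisplay
# PrecisionRecallDisplay.from_estimator(model_pipeline, test_df["comment"], test_df["toxic"])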
#%%
np.where(prec > 0.95)
#%%
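# precision_recall_curve returns len(thresholds) + 1 precision/recall values, with
# prec[i] matching thresholds[i]; index 374 here is taken from the np.where output
# above, a cut-off whose precision clears 0.95.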
thresholds[374]
#%%
precision_score(y_true=test_df["toxic"], y_pred=model_pipeline.predict_proba(test_df["comment"])[:, 1] > thresholds[374])
#%%
recall_score(y_true=test_df["toxic"], y_pred=model_pipeline.predict_proba(test_df["comment"])[:, 1] > thresholds[374])
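#%%
# Helper (sketch): classify with the stricter cut-off found above instead of the
# default 0.5; predict_toxic is a hypothetical name, not part of the original notebook.
def predict_toxic(pipeline, texts, threshold):
    return (pipeline.predict_proba(texts)[:, 1] > threshold).astype(int)

predict_toxic(model_pipeline, test_df["comment"], thresholds[374])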
#%%
grid_pipeline = Pipeline([
    ("vectorizer", TfidfVectorizer(tokenizer=lambda x: tokenize_sentence(x, remove_stop_words=True))),
    ("model", GridSearchCV(
        LogisticRegression(random_state=0),
        param_grid={"C": [0.1, 1, 10.0]},
        cv=3,
        verbose=4,
    )),
])
#%%
grid_pipeline.fit(train_df["comment"], train_df["toxic"])
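#%%
# Sketch: the fitted GridSearchCV sits in the "model" step of the pipeline;
# inspect the winning regularisation strength.
grid_pipeline.named_steps["model"].best_params_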
#%%
model_pipeline_c_10 = Pipeline([
    ("vectorizer", TfidfVectorizer(tokenizer=lambda x: tokenize_sentence(x, remove_stop_words=True))),
    ("model", LogisticRegression(random_state=0, C=10.0)),
])
#%%
model_pipeline_c_10.fit(train_df["comment"], train_df["toxic"])
#%%
prec_c_10, rec_c_10, thresholds_c_10 = precision_recall_curve(y_true=test_df["toxic"], probas_pred=model_pipeline_c_10.predict_proba(test_df["comment"])[:, 1])
#%%
np.where(prec_c_10 > 0.95)
#%%
precision_score(y_true=test_df["toxic"], y_pred=model_pipeline_c_10.predict_proba(test_df["comment"])[:, 1] > thresholds_c_10[316])
#%%
recall_score(y_true=test_df["toxic"], y_pred=model_pipeline_c_10.predict_proba(test_df["comment"])[:, 1] > thresholds_c_10[316])
#%%
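# Compare (sketch): recall of the two models at the high-precision operating points
# chosen above (index 374 for the default C=1 model, index 316 for C=10).
rec[374], rec_c_10[316]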
@anthony-pipeline:
The Notebook Does Not Appear to Be Valid JSON...

@arion-git:
Rebuilt it, it should work now. If anyone needs it:
https://drive.google.com/file/d/1UXFXv27ur-H4LxdCPsqTl7uAk-WBB0t2/view?usp=share_link
