Skip to content

Instantly share code, notes, and snippets.

@zograf
Last active June 1, 2023 22:49
Show Gist options
  • Save zograf/7aefd45d4f594a76d61c9607fc7dee7b to your computer and use it in GitHub Desktop.
Save zograf/7aefd45d4f594a76d61c9607fc7dee7b to your computer and use it in GitHub Desktop.
ORI kolokvijum 2 kod
import pandas as pd
import numpy as np
import re
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, accuracy_score
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier, AdaBoostClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans, DBSCAN
import matplotlib.pyplot as plt
#### UTILITY ####
def calculate_rmse(predicted, true):
    """Return the root-mean-square error between predictions and targets.

    Both inputs are converted to NumPy float arrays before subtracting, so
    plain Python lists are accepted and two pandas Series are compared
    position by position instead of being aligned on their index labels.

    Args:
        predicted: Array-like of predicted values.
        true: Array-like of ground-truth values, same shape as ``predicted``.

    Returns:
        The RMSE as a NumPy float.
    """
    errors = np.asarray(predicted, dtype=float) - np.asarray(true, dtype=float)
    return np.sqrt(np.mean(errors ** 2))
def remove_outliers(df_in, col_name, scale=1.5):
    """Return the rows of ``df_in`` whose ``col_name`` value is not an outlier.

    Outliers are detected with the IQR rule: a value is kept when it lies
    within ``[q1 - scale * iqr, q3 + scale * iqr]``. The bounds are
    inclusive, so a column with zero IQR (e.g. a constant column) keeps its
    rows instead of losing all of them. Rows with a missing value in
    ``col_name`` are dropped, because they never compare as in-range.

    Args:
        df_in: Input DataFrame; it is not modified.
        col_name: Name of the numeric column to inspect.
        scale: Multiplier applied to the IQR when computing the fences.

    Returns:
        A DataFrame containing only the rows inside the fences.
    """
    values = df_in[col_name]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    fence_low = q1 - scale * iqr
    fence_high = q3 + scale * iqr
    # Series.between is inclusive on both ends by default.
    return df_in.loc[values.between(fence_low, fence_high)]
#### MAIN CODE ####
# Load the data and take a look at it.
# ('naziv_kolone' below is a placeholder meaning "column name".)
df = pd.read_csv("./data/train.csv")
print(df)
# Check whether the values of the column are balanced
df["naziv_kolone"].value_counts().plot.bar()
plt.show()
# Drop the rows that have missing values
df = df.dropna()
# LabelEncoder -> assigns an integer to every category of a categorical feature.
# Sometimes this is undesirable because some categories get a larger number
# than others, which implies an ordering.
lenc = LabelEncoder()
df['naziv_kolone'] = lenc.fit_transform(df['naziv_kolone'])
# One-hot encoding -> turns e.g. 'pol' into pol_Male / pol_Female and 'oblast'
# into oblast_A / oblast_B indicator columns.
# drop_first=True already removes the first dummy of every encoded column
# (pol_Female here), so exactly one 'pol' indicator remains.
# Alternative: call get_dummies without drop_first and drop one dummy by hand,
# e.g. df.drop('pol_Male', axis=1). Do NOT do both: that would remove both
# 'pol' dummies and lose the feature entirely.
df = pd.get_dummies(df, columns=['oblast', 'pol'], drop_first=True)
# Split into training and test sets
train, test = train_test_split(df, test_size=0.3, random_state=42)
# Use a boxplot to check whether the column contains outliers
plt.boxplot(train['naziv_kolone'])
plt.show()
# Remove the outliers (only the training set is filtered here)
train = remove_outliers(train, 'naziv_kolone')
# x_train holds the columns we learn from (everything except the target)
# y_train is the column we predict
x_train = train.drop("zvanje", axis=1)
y_train = train['zvanje']
# Standardization of the form z = (x - mean) / sd
# NOTE(review): the original note claimed the results lie between 0 and 1;
# that describes min-max scaling. Standardized values are centred on 0 and
# are not limited to that range.
st = StandardScaler()
# Fit on the training data only
st.fit(x_train)
# After fitting, transform the training data
x_train[x_train.columns] = st.transform(x_train[x_train.columns])
# Split the test set the same way as the training set above
x_test = test.drop("zvanje", axis=1)
y_test = test['zvanje']
# Apply ONLY the transformation (no fit) to the test data
x_test[x_test.columns] = st.transform(x_test[x_test.columns])
# Rules of thumb for the network:
# - use 1-5 hidden layers
# - layer sizes should be powers of two and decreasing (e.g. [64, 32])
# - the total number of neurons across the layers should be around ~128
# - the learning rate should be between 10^-1 and 10^-6
# - 0.01 is the usual choice for small networks like these, 0.001 for deeper ones
#
# The classification and regression variants below are alternatives; pick the
# one that matches the target column. Each model is evaluated with its own
# metric right after it is trained, so the F1 score is never computed on the
# continuous output of the regressor.

# CLASSIFICATION
mlp = MLPClassifier(hidden_layer_sizes=[50, 50, 20], learning_rate_init=0.01, max_iter=1000, verbose=True, random_state=42).fit(x_train, y_train)
y_pred = mlp.predict(x_test)
# F1 metric (classification); scikit-learn expects (y_true, y_pred)
F1 = f1_score(y_test, y_pred, average='micro')
print(f'F1: {F1}')

# REGRESSION
mlp = MLPRegressor(hidden_layer_sizes=[50, 50, 20], learning_rate_init=0.01, max_iter=20000, verbose=True, random_state=42).fit(x_train, y_train)
y_pred = mlp.predict(x_test)
# Plot the training loss curve to see how the chosen hyperparameters behave
plt.plot(np.arange(len(mlp.loss_curve_)), mlp.loss_curve_)
plt.show()
# RMSE metric (regression)
RMSE = calculate_rmse(y_pred, y_test)
print(f'RMSE: {RMSE}')
#### NAIVE BAYES ####
# Encode the labels
df['Character'] = LabelEncoder().fit_transform(df['Character'])
# Text clean-up: lowercase, strip punctuation, drop one-character words
df['Line'] = df['Line'].apply(lambda x: x.lower())
df['Line'] = df['Line'].apply(lambda x: re.sub(r"[^\w\s]","", x))
df['Line'] = df['Line'].apply(lambda x: " ".join([word for word in x.split() if len(word) >= 2]))
# train_test_split can also split features and target directly when the
# target column is known up front. The raw text is kept in its own variables
# so that every vectorizer below is fitted on text, not on the output of a
# previous vectorizer.
x_train_text, x_test_text, y_train, y_test = train_test_split(df['Line'], df['Character'], test_size=0.2, random_state=496)
# Remove stop words; ngram_range controls how many consecutive words form a
# feature (single words, pairs of words, triples, ...).
# Option 1: raw counts of unigrams and bigrams
vect = CountVectorizer(stop_words="english", ngram_range=(1, 2))
x_train = vect.fit_transform(x_train_text)
x_test = vect.transform(x_test_text)
# Option 2 (alternative to CountVectorizer; overrides option 1): TF-IDF weights
vect = TfidfVectorizer(stop_words='english', ngram_range=(1, 1))
x_train = vect.fit_transform(x_train_text)
x_test = vect.transform(x_test_text)
# Apply Naive Bayes
nb = MultinomialNB().fit(x_train, y_train)
y_pred = nb.predict(x_test)
# METRIC (scikit-learn expects (y_true, y_pred))
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
#### CLUSTERING ####
df = pd.read_csv('bank.csv')
print(df)
# Label-encode the columns one at a time with a single shared encoder,
# in the same order as before (the encoder ends up fitted on 'pep').
lenc = LabelEncoder()
encoded_columns = ['id', 'sex', 'region', 'married', 'car', 'save_act', 'current_act', 'mortgage', 'pep']
for column_name in encoded_columns:
    df[column_name] = lenc.fit_transform(df[column_name])
print(df)
# Features used for clustering.
# NOTE(review): 'id' looks like a row identifier rather than a real feature,
# and the features are not scaled although MinMaxScaler is imported at the
# top of the file -- confirm whether scaling was intended.
feature_columns = ['id', 'age', 'sex', 'region', 'income', 'married', 'children', 'car', 'save_act', 'current_act', 'mortgage', 'pep']
X = df[feature_columns]
# K-means with 10 clusters
kmeans = KMeans(n_clusters=10, random_state=42).fit(X)
labels = kmeans.labels_
# DBSCAN as an alternative; its labels replace the k-means labels.
# NOTE(review): eps=0.02 is very small for unscaled features -- verify.
dbscan = DBSCAN(eps=0.02, min_samples=5).fit(X)
labels = dbscan.labels_
# Scatter plot of age vs. income, coloured by the (DBSCAN) cluster label
plt.scatter(X['age'], X['income'], c=labels)
plt.show()
### SOLUTIONS TO THE PRACTICE TASKS ###
# FIRST #
import pandas as pd
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
df = pd.read_csv('customer_churn.csv')
X = df[['total intl minutes']]
y = df['churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
mlp = MLPRegressor(hidden_layer_sizes=(128, 64), activation='relu', solver='adam', random_state=42)
mlp.fit(X_train, y_train)
# Predict the churn score for new values.
# NOTE(review): a regressor's output is not guaranteed to stay within [0, 1];
# if a real probability is required, MLPClassifier.predict_proba may be a
# better fit -- confirm against the task statement.
new_X = [[5], [60]] # New input values
# The model was fitted on a DataFrame, so predict on a DataFrame with the
# same column name; this avoids scikit-learn's feature-name mismatch warning.
predictions = mlp.predict(pd.DataFrame(new_X, columns=X.columns))
for x, prediction in zip(new_X, predictions):
    print(f"{x[0]} minuta: {prediction}")
# SECOND #
# Read the data set from the CSV file
df = pd.read_csv('customer_churn.csv')
X = df[['total intl minutes', 'total day minutes']]
# Build a 2-cluster KMeans model and fit it on the two features
kmeans = KMeans(n_clusters=2, random_state=42).fit(X)
# Store the cluster label of every row in the DataFrame
df['cluster'] = kmeans.labels_
# Sum of the 'churn' column per cluster (presumably a boolean / 0-1 flag,
# so this counts the churned rows -- verify against the data)
churn_counts = df.groupby('cluster')['churn'].sum()
# Total number of rows per cluster
total_counts = df['cluster'].value_counts()
# Churn percentage per cluster (the two Series are aligned on the cluster id)
churn_percentages = churn_counts / total_counts * 100
# Print the result, one line per cluster
for percentage in churn_percentages:
    print(f"Procenat napuštanja kompanije: {percentage}%")
# Plot the clusters
plt.scatter(X['total intl minutes'], X['total day minutes'], c=df['cluster'])
plt.show()
# THIRD #
data = pd.read_csv('customer_churn.csv')
# Label-encode the two plan columns. The values are read from `data` itself,
# so this section no longer depends on a `df` left over from an earlier one.
lenc = LabelEncoder()
data['international plan'] = lenc.fit_transform(data['international plan'])
data['voice mail plan'] = lenc.fit_transform(data['voice mail plan'])
# Select the relevant features and target variable
X = data[['international plan', 'voice mail plan', 'number vmail messages', 'total intl calls', 'total night calls', 'total day calls']]
y = data['churn']
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Create an instance of the MLPClassifier
model = MLPClassifier(hidden_layer_sizes=(128, 64), activation='relu', solver='adam', random_state=42)
# Fit the model to the training data
model.fit(X_train, y_train)
# Make predictions on the test set
predictions = model.predict(X_test)
# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, predictions)
# Print the accuracy
print("Accuracy on the test set:", accuracy)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment