Skip to content

Instantly share code, notes, and snippets.

View rkdgusrn1212's full-sized avatar

강현구 (Hyungu Kang) rkdgusrn1212

View GitHub Profile
@rkdgusrn1212
rkdgusrn1212 / ExpCalculator.java
Created August 17, 2022 07:43
자바 수식 계산기(연산자 우선순위 고려)
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Stack;
/**
* 피연산자 여러개 연산 가능합니다.
*
* ***예제1***
* 수식을 한 줄에 입력해주세요.
@rkdgusrn1212
rkdgusrn1212 / m_estimate_encoder.py
Last active June 14, 2022 09:23
Machine Learning : M-Estimate
# Identify high-cardinality categorical features that are candidates for
# target encoding, then carve off a subset of rows for fitting the encoder.
for col in df.columns:
    if df[col].dtype == "object" and df[col].nunique() > 10:
        # The more categories a feature has, the more it benefits from target encoding.
        print(col)
        # BUG FIX: the original condition was `df[col].value_counts().any() < 5`,
        # which compares a boolean to 5 and is therefore always True. The intent
        # is to detect rare categories (fewer than 5 occurrences), which need
        # smoothing when target-encoded.
        if (df[col].value_counts() < 5).any():
            print("required smoothing")  # rare categories present -> apply smoothing

# Separate the rows into an encoding split and a train split.
X_encode = df.sample(frac=0.20, random_state=0)  # encoding split: used to fit the encoder
y_encode = X_encode.pop("target")  # target column the encoder is fit against (train rows are transformed later)
@rkdgusrn1212
rkdgusrn1212 / pca.py
Last active June 8, 2022 04:14
PCA(Principal Component Analysis)
# PCA (Principal Component Analysis): standardize features, then project
# onto the principal components; afterwards cluster a feature subset with KMeans.
from sklearn.decomposition import PCA
# Standardize every column of X to mean 0 and standard deviation 1.
X = (X - X.mean(axis=0)) / X.std(axis=0)
pca = PCA()
X_pca = pca.fit_transform(X)
# X_pca is a 2-D ndarray: rows = samples (index), columns = principal components.
# pca.components_ yields a 2-D array with rows = principal components and columns = features.
from sklearn.cluster import KMeans
# Standardize the `features` columns to mean 0, std 1 before clustering.
X_scaled = X.loc[:, features]# cluster using all rows but only the `features` columns
X_scaled = (X_scaled - X_scaled.mean(axis=0)) / X_scaled.std(axis=0)# per-column mean and std taken over the rows
# n_clusters = number of clusters; n_init = number of runs with different random
# centroids — the best clustering among those runs is the one returned.
kmeans = KMeans(n_clusters=10, n_init=10)
X["Cluster"] = kmeans.fit_predict(X_scaled)# write the cluster labels back into the training data
@rkdgusrn1212
rkdgusrn1212 / feature_engineering_skills.py
Created June 7, 2022 09:54
Feature Engineering Skills
# Assorted feature-engineering recipes; each df_new below is an independent example.

# Categorical x numerical interaction: one-hot encode the category, then
# multiply each dummy column by the numerical feature (row-wise).
df_new = pd.get_dummies(df.cat_feat, prefix="catxnum_feat").mul(df.num_feat, axis=0)

# Count, per row, how many of the listed columns are greater than 0.
df_new = pd.DataFrame()  # idiom fix: dropped non-Pythonic trailing semicolons
df_new["count"] = df[["feat_1", "feat_2"]].gt(0.0).sum(axis=1)

# When each category of a categorical feature splits on "_" into 3 sub-features.
df_new = pd.DataFrame()
df_new[["cat_feat_1", "cat_feat_2", "cat_feat_3"]] = df.cat_feat.str.split("_", n=2, expand=True)
@rkdgusrn1212
rkdgusrn1212 / mutual_information.py
Last active June 7, 2022 04:53
Mutual Information
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_selection import mutual_info_regression
# Set Matplotlib defaults
plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True)
plt.rc(
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
# A larger MAE means worse performance, so scikit-learn reports it negated
# ('neg_mean_absolute_error'): scores closer to 0 are better.
scores = cross_val_score(my_pipeline, X, y, cv=5, scoring='neg_mean_absolute_error')
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
# Read the train/test CSVs, using the 'Id' column as the DataFrame index.
train_data = pd.read_csv('train.csv', index_col='Id')
test_data = pd.read_csv('test.csv', index_col='Id')
@rkdgusrn1212
rkdgusrn1212 / one_hot_encoding.py
Last active June 1, 2022 01:46
One-Hot encoding
# One-hot encode the low-cardinality categorical columns of X_train / X_valid.
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
# Encode only columns that will not explode into too many dummy dimensions.
low_cardinality_cols = [col for col in object_cols if X_train[col].nunique() < 10]
# handle_unknown='ignore': categories unseen during fit are encoded as all zeros.
# NOTE(review): `sparse=` was renamed `sparse_output=` in scikit-learn 1.2 and
# removed in 1.4 — confirm which sklearn version this snippet targets.
encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(encoder.fit_transform(X_train[low_cardinality_cols]))
# NOTE(review): wrapping in pd.DataFrame resets the row index; downstream code
# typically needs `OH_cols_train.index = X_train.index` — verify against callers.
OH_cols_valid = pd.DataFrame(encoder.transform(X_valid[low_cardinality_cols]))
@rkdgusrn1212
rkdgusrn1212 / ordinal_encode_categorical_col.py
Created May 31, 2022 17:50
Ordinal Encode Categorical Column
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

# Collect the columns that hold categorical (object-dtype) data.
object_cols = []
for col in train_data.columns:
    if train_data[col].dtype == "object":
        object_cols.append(col)

# Keep only the columns whose values in valid_data all appear in train_data;
# this is a precondition for reusing an encoder fitted on train_data.
good_cols = []
for col in object_cols:
    train_values = set(train_data[col])
    if set(valid_data[col]) <= train_values:
        good_cols.append(col)

ordinal_encoder = OrdinalEncoder()