Skip to content

Instantly share code, notes, and snippets.

View mesutcanalkan's full-sized avatar

mesutcanalkan

  • QbizUK
  • London
View GitHub Profile
def calculate_roc_auc(model_pipe, X, y):
"""Calculate roc auc score.
Parameters:
===========
model_pipe: sklearn model or pipeline
X: features
y: true target
"""
y_proba = model_pipe.predict_proba(X)[:,1]
pipeline = Pipeline([
('dtype_1', FeatureTransformer(features=INTEGER_COLS, assign_dtype=int)),
('dtype_2', FeatureTransformer(features=FLOAT_COLS, assign_dtype=float)),
('dtype_3', FeatureTransformer(features=OBJECT_COLS, assign_dtype='object')),
('basic_transformations_1', FeatureTransformer(features=['deck', 'embark_town'], strlowercase=True)),
('basic_transformations_2', FeatureTransformer(features=['embark_town'], remove_whitespace=True)),
('num_imputer', Imputer(NUMERICAL_COLS, method='mean')),
('scaler', Scaler(NUMERICAL_COLS)),
('cat_encoder', Encoder(OBJECT_COLS, encoding='onehotencoder')),
('pca', PCA(n_components=0.95)),
# Candidate classifiers keyed by a display name. Each value is a freshly
# constructed estimator using its default constructor arguments.
# Entries that are commented out below are not part of the dict.
# NOTE(review): presumably iterated by a grid-search routine (per the
# variable name) — confirm against the caller.
models_for_gridsearch = {
'LogisticRegression': LogisticRegression(),
'KNeighborsClassifier': KNeighborsClassifier(),
# 'DecisionTreeClassifier': DecisionTreeClassifier(),
# 'ExtraTreesClassifier': ExtraTreesClassifier(),
'RandomForestClassifier': RandomForestClassifier(),
# 'AdaBoostClassifier': AdaBoostClassifier(),
# 'GradientBoostingClassifier': GradientBoostingClassifier(),
'SVC': SVC(),
}
class FeatureTransformer(BaseEstimator, TransformerMixin):
def __init__(self, features, assign_dtype=None, strlowercase=False, remove_whitespace=False): # no *args or **kargs
self.features = features
self.assign_dtype = assign_dtype
self.remove_whitespace = remove_whitespace
self.strlowercase = strlowercase
def fit(self, X, y=None):
return self # nothing else to do
def transform(self, X, y=None):
if self.assign_dtype is not None:
# Data manipulation
import seaborn as sns
import numpy as np
import pandas as pd
pd.options.display.precision = 4
pd.options.mode.chained_assignment = None
from sklearn import set_config
# Machine learning pipeline
from sklearn.pipeline import Pipeline
WITH date_table AS ( WITH recursive date_in_range(day) AS ( VALUES('2019-12-30')
UNION ALL
SELECT date(day, '+7 day')
FROM date_in_range
WHERE date(day, '+7 day') < '2020-10-19' )
SELECT strftime('%Y-%W', day) AS year_week,
row_number() OVER(ORDER BY day) AS yw_id
FROM date_in_range)
SELECT table_for_flags.user_id ,
table_for_flags.year_week ,
# !pip install pulp
import pulp as plp
# %matplotlib inline
from matplotlib import pyplot as plt
import cv2
import numpy as np
from pytesseract import image_to_string
import pytesseract
# Point pytesseract at the Tesseract executable instead of relying on PATH.
# NOTE(review): the trailing '/' makes this look like a directory rather than
# an executable file path — confirm the intended location of the binary.
pytesseract.pytesseract.tesseract_cmd = r'/usr/local/bin/tesseract/'
# !pip install streamlit==0.72.0
@mesutcanalkan
mesutcanalkan / Seasonal Gamelog.py
Last active November 9, 2021 23:31
2000-01 Season for 2000 Drafted Players
# Keep only the rows whose 'DRAFT YEAR' equals the string '2000' (string
# comparison, not numeric), restrict the result to the listed columns, and
# call reset_index(drop=True) on it.
# Assumes df_Players is a pandas DataFrame defined earlier — TODO confirm.
df_Players_Drafted_2000 = df_Players[
df_Players['DRAFT YEAR']=='2000'
][
['PLAYER', 'TEAM', 'AGE', 'HEIGHT', 'WEIGHT', 'COLLEGE COUNTRY','DRAFT YEAR', 'ESPN_GAMELOG_ID']
].reset_index(drop=True)
# Starts empty. Presumably collects one per-player result from the loop that
# follows so they can be concatenated afterwards — verify against that loop.
SEASON_2000_2001_CAREER_LIST_TO_CONCAT = []
for INDEX, ROW in df_Players_Drafted_2000.iterrows():
@mesutcanalkan
mesutcanalkan / yearly_player_gamelog.py
Last active November 9, 2021 23:24
Player Gamelog
def get_yearly_player_gamelog(PLAYER_URL, YEAR, PLAYER_NAME):
try:
HTML_RESULTS = pd.read_html('{}{}'.format(PLAYER_URL, YEAR))
LEN_HTML_RESULTS = len(HTML_RESULTS)
EMPTY_LIST = []
@mesutcanalkan
mesutcanalkan / google_search_player_id.py
Last active November 9, 2021 22:51
Google search for NBA players
for INDEX, ROW in df_Players.iterrows():
# Pause between requests to avoid HTTP 429 (Too Many Requests) errors.
time.sleep(5)
try:
site= 'https://www.google.com/search?q={}+nba+espn+gamelog'.format(df_Players.loc[INDEX,
'PLAYER'].replace(' ', '+'))