Last active
October 9, 2018 12:54
-
-
Save RadoslawB/d05269e05bee5b9cc73ad42ef459a296 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import tarfile
import urllib.request

import numpy as np
def get_data(file_url, file_name='downloaded'):
    """Download a gzip-compressed tar archive and unpack it into the current directory.

    Args:
        file_url: URL of the ``.tar.gz`` archive to fetch.
        file_name: Local path the archive is written to before it is extracted.
    """
    # The target path must be passed explicitly: without it urlretrieve saves
    # the download under a temporary name and tarfile.open(file_name) below
    # would look for a file that was never created.
    urllib.request.urlretrieve(file_url, file_name)
    with tarfile.open(file_name, "r:gz") as archive:
        # The archive comes from the network, so use the "data" extraction
        # filter where the running Python provides it; it refuses members that
        # would escape the destination directory.
        if hasattr(tarfile, "data_filter"):
            archive.extractall(filter="data")
        else:
            archive.extractall()
import pandas as pd | |
def _fits_in_int32(values: pd.Series) -> bool:
    """Return True when every value in *values* is representable as int32."""
    if values.empty:
        return True
    bounds = np.iinfo(np.int32)
    return bool(bounds.min <= values.min() and values.max() <= bounds.max)


def convert_types(df: pd.DataFrame):
    """Shrink a DataFrame by converting its columns to smaller dtypes.

    The frame is modified in place and also returned. For each column the
    first matching rule is applied:

    * label contains ``'ID'``: missing values become 0 and the column is cast
      to int32, or kept as int64 when the values do not fit in int32;
    * object dtype with at least one repeated value: cast to ``category``;
    * exactly the two values 0 and 1: cast to ``bool``;
    * float64: cast to float32;
    * int64 whose values fit in int32: cast to int32.

    Args:
        df: DataFrame to convert.

    Returns:
        The same DataFrame object, with converted columns.
    """
    for c in df:
        col = df[c]

        # Column labels are not necessarily strings; a label that does not
        # support ``in`` is simply not an ID column.
        try:
            is_id = 'ID' in c
        except TypeError:
            is_id = False

        # Compare as a set so columns holding mixed, unorderable values do not
        # raise the way sorting them would.
        uniques = col.unique()
        is_zero_one = len(uniques) == 2 and set(uniques) == {0, 1}

        if is_id:
            # Go through int64 first: casting straight to int32 silently wraps
            # identifiers that exceed the int32 range.
            as_int = col.fillna(0).astype(np.int64)
            df[c] = as_int.astype(np.int32) if _fits_in_int32(as_int) else as_int
        elif (col.dtype == 'object') and (col.nunique() < df.shape[0]):
            df[c] = col.astype('category')
        elif is_zero_one:
            # 0/1 indicator columns become real booleans.
            df[c] = col.astype(bool)
        elif col.dtype == np.float64:
            df[c] = col.astype(np.float32)
        # Explicit np.int64 avoids the platform-dependent meaning of ``int``;
        # the range check keeps large values from being corrupted.
        elif col.dtype == np.int64 and _fits_in_int32(col):
            df[c] = col.astype(np.int32)
    return df
import json | |
from pandas.io.json import json_normalize | |
import pandas as pd | |
def load_df(path: str, nrows=None, dtype=None):
    """Read a CSV whose JSON-encoded columns are expanded into flat columns.

    The columns ``device``, ``geoNetwork``, ``totals`` and ``trafficSource``
    are parsed with ``json.loads`` and each is replaced by one column per JSON
    key, named ``"<column>.<key>"``.

    Args:
        path: Location of the CSV file.
        nrows: Maximum number of rows to read; ``None`` reads the whole file.
        dtype: Optional dtype specification forwarded to ``pandas.read_csv``.

    Returns:
        DataFrame holding the non-JSON columns followed by the flattened ones.
    """
    json_cols = ['device', 'geoNetwork', 'totals', 'trafficSource']
    converters = {col: json.loads for col in json_cols}
    df = pd.read_csv(path, dtype=dtype, converters=converters, nrows=nrows)

    # Prefer the top-level pandas function and fall back to the older import
    # location only when the installed pandas does not have it.
    normalize = getattr(pd, 'json_normalize', None)
    if normalize is None:
        from pandas.io.json import json_normalize as normalize

    for column in json_cols:
        # A plain list of dicts is accepted by every json_normalize version.
        column_as_df = normalize(df[column].tolist())
        column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
        # Both frames share the same row index, so joining on it lines the
        # expanded values up with their source rows.
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
    return df
def flatten_df_column_multi_index(df: pd.DataFrame, separator='.') -> list:
    """Build single-level column names from a DataFrame's (multi-level) columns.

    Args:
        df: DataFrame whose column labels are flattened; it is not modified.
        separator: Text placed between the levels of each label.

    Returns:
        One string per column, in column order. Tuple labels have their levels
        joined with *separator*; any other label is returned as its string form.
    """
    flat_names = []
    for label in df.columns:
        if isinstance(label, tuple):
            # Levels may be non-strings (e.g. integers), so stringify them.
            flat_names.append(separator.join(str(level) for level in label))
        else:
            # Joining a plain string label would split it into characters.
            flat_names.append(str(label))
    return flat_names
# NOTE(review): df_adb is not defined in the visible part of this file; it is
# presumably a DataFrame with 'start_date' and 'end_date' columns — confirm.
# Re-parse start_date through its string form so it becomes a datetime column.
df_adb['start_date'] = pd.to_datetime(df_adb['start_date'].astype(str))
# Whole days between the two dates; assumes end_date is already datetime-like — TODO confirm.
df_adb['diff'] = (df_adb['end_date'] - df_adb['start_date']).dt.days
def train_lgbm(X_train: np.ndarray, Y_train: np.ndarray, X_test: np.ndarray,
               model=None, folds=None):
    """Fit a classifier on every cross-validation fold and average its test predictions.

    Args:
        X_train: Training features; rows are selected with integer index arrays.
        Y_train: Training targets aligned with the rows of X_train.
        X_test: Features to predict for.
        model: Optional estimator with LightGBM's sklearn-style ``fit`` and
            ``predict``. Defaults to the LGBMClassifier configured below.
        folds: Optional splitter exposing ``split(X, y)``. Defaults to a
            shuffled 3-fold KFold with ``random_state=101``.

    Returns:
        Float array of length ``X_test.shape[0]`` holding the mean of
        ``model.predict(X_test)`` over all folds.
    """
    if model is None:
        model = LGBMClassifier(
            nthread=psutil.cpu_count(),
            n_estimators=850,
            learning_rate=0.015,
            reg_alpha=.4,
            reg_lambda=.2)
    if folds is None:
        folds = KFold(n_splits=3, shuffle=True, random_state=101)

    predictions = np.zeros(shape=(X_test.shape[0]))
    n_folds = 0
    for train_index, cv_index in folds.split(X_train, Y_train):
        x_train, x_cv = X_train[train_index], X_train[cv_index]
        y_train, y_cv = Y_train[train_index], Y_train[cv_index]
        # NOTE(review): 'f1_score' does not look like a built-in LightGBM
        # metric name, and newer LightGBM releases appear to take early
        # stopping / verbosity through callbacks instead of these keyword
        # arguments — confirm against the installed version.
        model.fit(
            x_train,
            y_train,
            eval_set=[(x_cv, y_cv)],
            eval_names=['cv'],
            eval_metric='f1_score',
            verbose=-1,
            early_stopping_rounds=20)
        # Accumulate into the running sum; assigning the fold's prediction to
        # `predictions` would discard every earlier fold.
        predictions += model.predict(X_test)
        n_folds += 1

    if n_folds == 0:
        # No folds were produced, so there is nothing to average.
        return predictions
    return predictions / n_folds
def prepare_dummies(df):
    """One-hot encode the ``category`` columns of *df* and keep every other column.

    Args:
        df: DataFrame to encode; it is not modified.

    Returns:
        New DataFrame with the indicator columns first, followed by the
        non-categorical columns of *df*.
    """
    categorical_features_types = ['category']
    df_categorical_features = df.select_dtypes(include=categorical_features_types)
    df_non_categorical_features = df.select_dtypes(exclude=categorical_features_types)
    # drop_first=True omits the first level of each categorical feature, so a
    # feature with k levels produces k-1 indicator columns.
    dummies = pd.get_dummies(df_categorical_features, drop_first=True)
    return pd.concat([dummies, df_non_categorical_features], axis=1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment