Skip to content

Instantly share code, notes, and snippets.

@RadoslawB
Last active October 9, 2018 12:54
Show Gist options
  • Save RadoslawB/d05269e05bee5b9cc73ad42ef459a296 to your computer and use it in GitHub Desktop.
import tarfile
import urllib.request

import numpy as np
def get_data(file_url, file_name='downloaded'):
    """Download a gzipped tarball and extract it into the current directory.

    Parameters
    ----------
    file_url : str
        URL of a ``.tar.gz`` archive.
    file_name : str
        Local path the archive is saved to before extraction.
    """
    # Bug fix: the original called urlretrieve(file_url) without a filename,
    # so the download went to a temp file and tarfile.open(file_name) could
    # not find it. Pass file_name explicitly so both steps agree.
    urllib.request.urlretrieve(file_url, file_name)
    with tarfile.open(file_name, "r:gz") as f:
        f.extractall()
import pandas as pd
def convert_types(df: pd.DataFrame) -> pd.DataFrame:
    """Downcast column dtypes in-place to shrink a DataFrame's memory footprint.

    Rules, applied per column in this order (first match wins):
    - column name contains ``'ID'``  -> NaNs filled with 0, cast to int32
    - object dtype with repeated values -> ``category``
    - unique values exactly ``[0, 1]`` -> ``bool``
    - float64 -> float32
    - int64   -> int32

    Parameters
    ----------
    df : pd.DataFrame
        Mutated in place; also returned for chaining.

    Returns
    -------
    pd.DataFrame
        The same (mutated) DataFrame.
    """
    for c in df:
        # ID-like columns: NaNs silently become 0 so the int cast succeeds.
        if 'ID' in c:
            df[c] = df[c].fillna(0).astype(np.int32)
        # Objects with duplicates compress well as categoricals; an
        # all-unique object column gains nothing, so it is skipped.
        elif (df[c].dtype == 'object') and (df[c].nunique() < df.shape[0]):
            df[c] = df[c].astype('category')
        # Columns holding only 0/1 are really booleans.
        elif sorted(list(df[c].unique())) == [0, 1]:
            df[c] = df[c].astype(bool)
        # float64 -> float32 (halves the storage).
        elif df[c].dtype == float:
            df[c] = df[c].astype(np.float32)
        # int64 -> int32.
        elif df[c].dtype == int:
            df[c] = df[c].astype(np.int32)
    return df
import json
from pandas.io.json import json_normalize
import pandas as pd
def load_df(path: str, nrows=None, dtype=None) -> pd.DataFrame:
    """Read a CSV whose JSON-encoded columns are flattened into real columns.

    The columns ``device``, ``geoNetwork``, ``totals`` and ``trafficSource``
    are parsed from JSON strings, normalized into sub-columns named
    ``"<column>.<subcolumn>"``, and merged back by row index.

    Parameters
    ----------
    path : str
        Path of the CSV file.
    nrows : int, optional
        Forwarded to ``pd.read_csv`` to limit rows read.
    dtype : dict, optional
        Forwarded to ``pd.read_csv``. Bug fix: the original referenced an
        undefined ``dtype`` name (NameError); it is now an explicit,
        backward-compatible parameter.

    Returns
    -------
    pd.DataFrame
        The loaded frame with JSON columns replaced by flattened columns.
    """
    json_cols = ['device', 'geoNetwork', 'totals', 'trafficSource']
    converters = {col: json.loads for col in json_cols}
    df = pd.read_csv(path, dtype=dtype, converters=converters, nrows=nrows)
    for column in json_cols:
        column_as_df = json_normalize(df[column])
        column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
        # Drop the raw JSON column and splice in its flattened sub-columns.
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
    return df
def flatten_df_column_multi_index(df: pd.DataFrame, separator='.') -> list:
    """Collapse a MultiIndex column index into flat strings.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose ``columns`` is a MultiIndex of string levels.
    separator : str
        String placed between the levels of each column tuple.

    Returns
    -------
    list of str
        One joined name per column, e.g. ``('a', 'b') -> 'a.b'``.
    """
    # Bug fix: the original hard-coded "." and silently ignored `separator`.
    return [separator.join(levels) for levels in df.columns.ravel()]
# NOTE(review): `df_adb` is defined outside this chunk — cannot verify its schema here.
# Parse raw start_date values (presumably numeric YYYYMMDD — confirm against the source
# data) into proper datetimes via their string representation.
df_adb['start_date'] = pd.to_datetime(df_adb['start_date'].astype(str))
# Whole-day span between the two dates; assumes `end_date` is already datetime-typed —
# TODO confirm, otherwise the subtraction raises.
df_adb['diff'] = (df_adb['end_date'] - df_adb['start_date']).dt.days
def train_lgbm(X_train: np.ndarray, Y_train: np.ndarray, X_test: np.ndarray):
    """Train a LightGBM classifier with 3-fold CV and average test predictions.

    Parameters
    ----------
    X_train, Y_train : np.ndarray
        Training features and labels, indexable by integer position.
    X_test : np.ndarray
        Features to predict; one averaged prediction per row is returned.

    Returns
    -------
    np.ndarray
        Per-row predictions averaged over the folds.
    """
    light = LGBMClassifier(
        nthread=psutil.cpu_count(),
        n_estimators=850,
        learning_rate=0.015,
        reg_alpha=.4,
        reg_lambda=.2)
    folds = KFold(n_splits=3, shuffle=True, random_state=101)
    predictions = np.zeros(shape=(X_test.shape[0]))
    for train_index, cv_index in folds.split(X_train, Y_train):
        x_train, x_cv = X_train[train_index], X_train[cv_index]
        y_train, y_cv = Y_train[train_index], Y_train[cv_index]
        light.fit(
            x_train,
            y_train,
            eval_set=[(x_cv, y_cv)],
            eval_names=['cv'],
            eval_metric='f1_score',
            verbose=-1,
            early_stopping_rounds=20)
        # Bug fix: the original overwrote the accumulator with the last fold's
        # predictions and then doubled it (`predictions = ...; predictions +=
        # predictions`), so only the final fold counted, at twice its weight.
        # Accumulate each fold's predictions instead, then average below.
        predictions += light.predict(X_test)
    predictions = predictions / folds.n_splits
    return predictions
def prepare_dummies(df):
    """One-hot encode every categorical column of *df*.

    Category-dtype columns are replaced by dummy indicator columns (first
    level dropped to avoid collinearity); all other columns pass through
    untouched. Returns a new DataFrame with the dummies first.
    """
    categorical = df.select_dtypes(include=['category'])
    remainder = df.select_dtypes(exclude=['category'])
    encoded = pd.get_dummies(categorical, drop_first=True)
    return pd.concat([encoded, remainder], axis=1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment