Skip to content

Instantly share code, notes, and snippets.

View jmquintana79's full-sized avatar
💭
I may be slow to respond.

Juan Quintana jmquintana79

💭
I may be slow to respond.
View GitHub Profile
@jmquintana79
jmquintana79 / function_transformer.py
Last active July 8, 2024 12:03
Los custom transformers con los que he trabajado hasta ahora estaban desarrollados como clases. Existe una nueva manera mediante la funcionalidad FunctionTransformer: al aplicarla a una función, la convierte en un transformer usable
import pandas as pd
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
# example
from sklearn.linear_model import LogisticRegression
# X, y
def get_dummies_size(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with the ``size`` column one-hot encoded.

    The ``size`` column is replaced by one indicator column per distinct
    value (named ``size_<value>``); every other column is passed through
    unchanged.

    Args:
        df: Input frame; must contain a column named ``size``.

    Returns:
        A new DataFrame with the ``size`` column expanded into dummies.
    """
    return pd.get_dummies(df, columns=['size'])
@jmquintana79
jmquintana79 / pipeline_template_scikit.py
Last active July 8, 2024 11:55
References: - How to Create Pipelines in Scikit-learn for More Efficient Data Processing: https://www.statology.org/how-create-pipelines-scikit-learn-for-more-efficient-data-processing/
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
# example models and preprocessors
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
# X, y
class Credentials:
    """Hold the user/password pair exposed as ``self.user`` and ``self.password``.

    Args:
        user: Account name. Defaults to the placeholder ``"user"``.
        password: Account password. Defaults to the placeholder ``"password"``.
    """

    def __init__(self, user: str = "user", password: str = "password") -> None:
        # The defaults keep zero-argument construction (and subclasses calling
        # ``super().__init__()``) working exactly as before.
        # NOTE(review): these are placeholder values -- real secrets should be
        # passed in from configuration rather than hard-coded.
        self.user = user
        self.password = password
class Service(Credentials):
    """Subclass of ``Credentials`` that initialises itself through the parent."""

    def __init__(self) -> None:
        # Delegate to the parent initializer so the attributes it sets are
        # available on the Service instance.
        super().__init__()
import pandas as pd
## unstack a timeseries target variable according to a categorical reference column
def unstack_ts_according_to_reference(df:pd.DataFrame, c_dt:str, c_cat_reference:str, c_target_variable:str)->pd.DataFrame:
"""
Unstack a timeseries target variable according to a categorical reference column.
df -- Dataframe to be processed.
c_dt -- Temporal column.
c_cat_reference -- Categorical column to be used as reference to stack the target variable.
c_target_variable -- Num / Cat column to be stacked.
from scipy.stats import linregress
# Estimate the linear regression y = A*x + B.
# linregress returns five values that are unpacked in order: A and B are the
# first two (the slope and intercept of the fitted line, matching the formula
# above), followed by r_value, p_value and std_err.
# NOTE(review): x and y are not defined in this snippet -- presumably
# equal-length numeric sequences supplied by the caller; confirm.
A, B, r_value, p_value, std_err = linregress(x, y)
# original column
In [15]: df["timedelta_column"]
Out[15]:
0 1 days 00:00:00
1 3 days 02:00:00
2 5 days 04:00:00
3 7 days 06:00:00
4 9 days 08:00:00
5 11 days 10:00:00
dtype: timedelta64[ns]
import os
# Split `filename` into a (root, ext) pair and keep element 1, the extension.
# Per the os.path documentation the extension includes its leading dot and is
# an empty string when the name has none.
# NOTE(review): `filename` is not defined in this snippet -- presumably a path
# string provided by the caller.
extension = os.path.splitext(filename)[1]
import os
if os.path.isfile("filename.txt"):
# file exists
f = open("filename.txt")
if os.path.isdir("data"):
# directory exists
if os.path.exists(file_path):
import numpy as np
## angle format: 0/360 to -180/180
def angles_format(angle_0_360: np.ndarray) -> np.ndarray:
    """Convert angles from the 0/360 convention to the -180/180 convention.

    Values greater than or equal to 180 are shifted down by 360; all other
    values are returned unchanged (so 180 maps to -180 and 270 to -90).

    Args:
        angle_0_360: Array-like of angles expressed in the 0/360 range.
            Any shape is accepted.

    Returns:
        Array of the same shape as the input holding the converted angles.
    """
    angles = np.asarray(angle_0_360)
    # Vectorised selection instead of a per-element Python loop; unlike the
    # loop it also works on multi-dimensional input.
    return np.where(angles >= 180, angles - 360, angles)
## aggregation for angular data
def angles_agg(angle_0_360:np.array, func_agg) -> float:
"""
Calculate wind direction average.