Skip to content

Instantly share code, notes, and snippets.

View kvnkho's full-sized avatar
👋
Feel free to message me. Contact info in profile

Kevin Kho kvnkho

👋
Feel free to message me. Contact info in profile
View GitHub Profile
@kvnkho
kvnkho / median.py
Last active March 17, 2021 18:56
Simple median
#schema: user_id:int, measurement:int
def get_median(df:pd.DataFrame) -> pd.DataFrame:
    """Collapse a group of rows into one row holding the median measurement.

    Args:
        df: Frame with ``user_id`` and ``measurement`` columns and at least
            one row. Only the first row's ``user_id`` is used.

    Returns:
        A one-row DataFrame with columns ``user_id`` and ``median``.

    Raises:
        IndexError: If ``df`` has no rows.

    NOTE(review): the ``#schema:`` comment above this function lists
    ``measurement:int``, but the column returned here is named ``median``
    -- confirm which is intended.
    """
    # Pick the column first, then the row, so user_id keeps its own dtype
    # (a whole-row lookup upcasts mixed int/float rows to float).
    user_id = df['user_id'].iloc[0]
    # Series.median() yields a scalar; the DataFrame form
    # (df[['measurement']].median()) yields a Series, which would end up
    # nested inside the result cell.
    median = df['measurement'].median()
    return pd.DataFrame({'user_id': [user_id], 'median': [median]})
@kvnkho
kvnkho / comparison.py
Last active April 18, 2021 21:24
Comparing Pandas and Spark
# Comparison of creating inferred_state column
# Phone-prefix -> state lookup, consulted only as the last fallback below.
area_to_state = {"217": "IL", "312": "IL", "415": "CA", "352": "FL"}
# Pandas implementation: start from home_state, fill gaps from work_state,
# then fill what is still missing from the first three characters of phone.
df['inferred_state'] = (
    df['home_state']
    .fillna(df['work_state'])
    .fillna(df['phone'].str[:3].map(area_to_state))
)
# Spark implementation
from pyspark.sql.functions import coalesce, col, substring, create_map, lit
@kvnkho
kvnkho / fugue.py
Last active April 18, 2021 21:37
Fugue Example
# Import statements
from fugue import FugueWorkflow, SparkExecutionEngine
from typing import List, Any, Dict, Iterable
# Area code to state
area_to_state = {"217": "IL", "312": "IL", "415": "CA", "352": "FL"}
# schema: *, inferred_state:str
def fill_location(df:Iterable[Dict[str,Any]]) -> Iterable[Dict[str,Any]]:
for row in df:
import pandera as pa
# Pandera schema for the "Price" column: integer dtype, values checked
# against the 5-20 range.
price_check = pa.DataFrameSchema(
    {"Price": pa.Column(pa.Int, pa.Check.in_range(min_value=5, max_value=20))}
)
# schema: *
def price_validation(df:pd.DataFrame) -> pd.DataFrame:
    """Run ``price_check`` against ``df`` and hand the same frame back.

    The value returned by ``validate`` is discarded; the call is made only
    so that a failed check surfaces as an exception (presumably a pandera
    schema error -- confirm against the pandera docs).
    """
    _ = price_check.validate(df)
    return df
import pandera as pa
# Pandera schema for the "Price" column: integer dtype, values checked
# against the 5-20 range.
price_check = pa.DataFrameSchema(
    {"Price": pa.Column(pa.Int, pa.Check.in_range(min_value=5, max_value=20))}
)
# schema: *
def price_validation(df:pd.DataFrame) -> pd.DataFrame:
    """Check ``df`` with ``price_check`` and return ``df`` itself.

    Only the side effect of ``validate`` matters here (it is expected to
    raise when the check fails -- confirm against the pandera docs); its
    return value is not used.
    """
    _ = price_check.validate(df)
    return df
import pandas as pd
# Sample data, one (State, City, Price) record per row.
df = pd.DataFrame(
    [
        ('FL', 'Tampa', 8),
        ('FL', 'Orlando', 12),
        ('FL', 'Miami', 10),
        ('CA', 'Oakland', 16),
        ('CA', 'San Francisco', 20),
        ('CA', 'San Jose', 16),
    ],
    columns=['State', 'City', 'Price'],
)
import pandera as pa
from pandera import Column, Check, DataFrameSchema
from fugue import FugueWorkflow
from fugue_spark import SparkExecutionEngine
# Pandera schema, presumably for the State == 'FL' rows (confirm at the call
# site): "Price" is declared as a float column checked against the 7-13 range.
price_check_FL = pa.DataFrameSchema(
    {"Price": Column(pa.Float, Check.in_range(min_value=7, max_value=13))}
)
price_check_CA = pa.DataFrameSchema({
This file has been truncated, but you can view the full file.
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
from pycaret.classification import *
clf = setup(data = df,
target = "Survived",
session_id = 123,
silent = True,
verbose = False,
html = False)
models = compare_models(fold = 5,
from fugue import transform
import pandas as pd
schema = """Model:str, Accuracy:float, AUC:float, Recall:float, Prec:float,
F1:float, Kappa:float, MCC:float, TT_Sec:float"""
def wrapper(df: pd.DataFrame) -> pd.DataFrame:
clf = setup(data = df,
target = 'Survived',
session_id=123,