Skip to content

Instantly share code, notes, and snippets.

View tommydangerous's full-sized avatar
🧙
Fire!

DANGerous tommydangerous

🧙
Fire!
View GitHub Profile
@tommydangerous
tommydangerous / impute_values.py
Created June 11, 2021 05:31
impute_values.py
from sklearn.impute import SimpleImputer

# Fill missing cabin fields with explicit placeholder strings so that
# downstream categorical encoders see a dedicated "unknown" category
# instead of NaN.
cabin_missing = df['Cabin'].isna()
print(f'Missing values in "Cabin": {len(df[cabin_missing].index)}')
df.loc[cabin_missing, 'Cabin'] = 'somewhere out of sight'
df.loc[df['cabin_letter'].isna(), 'cabin_letter'] = 'ZZZ'

# Impute missing ages with the median, which is robust to outliers.
age_missing = df['Age'].isna()
print(f'Missing values in "Age": {len(df[age_missing].index)}')
age_imputer = SimpleImputer(strategy='median')
df.loc[:, ['Age']] = age_imputer.fit_transform(df[['Age']])
@tommydangerous
tommydangerous / remove_columns.py
Last active June 11, 2021 05:30
remove_columns.py
# Identifier-style fields carry no predictive signal, so remove them.
df = df.drop(columns=['Name', 'PassengerId'])
# Confirm that Name and PassengerId are no longer columns.
df.columns.tolist()

# Start again from the raw training split.
df = X_train_raw.copy()
# Add a binary column marking whether the person is old enough to vote.
df['can_vote'] = df['Age'].apply(lambda age: int(age >= 18))
# 892 passengers can vote; aka they are 18 or older
df['can_vote'].value_counts()
# Cabin letter: a cabin can be denoted as B123. The cabin letter will be B.
df.loc[:, 'cabin_letter'] = df['Cabin'].apply(
# Hold out 20% of the rows for testing; stratifying on y keeps the label
# distribution the same in both the train and test splits.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2,
)
@tommydangerous
tommydangerous / download_and_split_data.py
Created June 11, 2021 05:22
download_and_split_data
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the Titanic dataset and separate the features from the label.
df = pd.read_csv('/content/titanic_survival.csv')
label_feature_name = 'Survived'
y = df[label_feature_name]
X = df.drop(columns=[label_feature_name])
@tommydangerous
tommydangerous / all_together.py
Last active May 13, 2021 17:36
PySpark example all together
from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import (
IntegerType,
StringType,
StructField,
StructType,
)
@tommydangerous
tommydangerous / code_logic.py
Created May 13, 2021 03:28
PySpark example 3
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import (
IntegerType,
StringType,
StructField,
StructType,
)
"""
@tommydangerous
tommydangerous / define_schema.py
Last active May 13, 2021 03:31
PySpark example 2
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import (
IntegerType,
StringType,
StructField,
StructType,
)
"""
@tommydangerous
tommydangerous / define_function.py
Created May 13, 2021 03:20
PySpark example part 1
from pyspark.sql.functions import pandas_udf, PandasUDFType
@pandas_udf(
    SCHEMA_COMING_SOON,  # output schema placeholder — defined in a later step of the example
    PandasUDFType.GROUPED_MAP,
)
def custom_transformation_function(df):
    """Grouped-map pandas UDF stub.

    With GROUPED_MAP, Spark passes each group of rows to this function as
    a pandas DataFrame ``df``; the actual transformation logic is filled
    in later in the example.
    """
    pass
@tommydangerous
tommydangerous / pyspark_load_data_from_s3.py
Last active May 13, 2021 17:27
PySpark load data from S3
from pyspark.sql import SparkSession
def load_data(spark, s3_location):
    """Load data from the given S3 location using the provided Spark session.

    spark:
        Spark session
    s3_location:
        S3 bucket name and object prefix
    """