This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Function to find outliers using IQR | |
# Detects outliers column-by-column using the 1.5 * IQR fence rule:
# values below Q1 - 1.5*IQR or above Q3 + 1.5*IQR are outliers.
# NOTE(review): the function body is truncated in this view — the code that
# collects indices into `outlier_indices` and the return statement are not
# visible here; confirm against the full file.
def find_outliers_IQR(df): | |
# Accumulator for the row indices of detected outliers (filled further down,
# outside this view).
outlier_indices = [] | |
# Restrict to numeric columns only — quantile() is undefined for object dtypes.
df = df.select_dtypes(include=['number']) | |
for column in df.columns: | |
# First and third quartiles of this column.
Q1 = df[column].quantile(0.25) | |
Q3 = df[column].quantile(0.75) | |
IQR = Q3 - Q1 | |
# Tukey fences: anything outside [lower_bound, upper_bound] is an outlier.
lower_bound = Q1 - 1.5 * IQR | |
upper_bound = Q3 + 1.5 * IQR |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Expected dtype for every column of the recipe dataset; used to validate
# the loaded frame against this schema.
expected_types = {
    'recipe': 'int64',
    'calories': 'float64',
    'carbohydrate': 'float64',
    'sugar': 'float64',
    'protein': 'float64',
    'category': 'str',
    'servings': 'int64',
    'high_traffic': 'bool',
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Report (and, for small amounts, drop) missing data in `df`.
# NOTE(review): the function is truncated in this view — the per-column
# reporting after the final print is not visible here.
def check_missing_data(df): | |
# Check for missing values | |
# Percentage of rows that contain at least one null, rounded to 2 decimal
# places before scaling by 100.
proportion_null_rows = 100*(round(df.isnull().any(axis=1).sum()/df.any(axis=1).count(),2)) | |
if proportion_null_rows <= 5: | |
print(f"There are {df.isnull().any(axis=1).sum()} rows with a null value. All of them are erased!") | |
# NOTE(review): df.dropna() returns a NEW frame; without assignment or
# inplace=True this line has no effect, so nothing is actually erased
# despite the message above — likely a bug, confirm intended behavior.
df.dropna() | |
else: | |
print("Too many null values, we need to check columns by columns further.") | |
if df.isnull().sum().sum() > 0: | |
print("\nProportion of missing values by column") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Check if there are duplicates
def drop_duplicates(df, columns=None):
    """Remove duplicate rows from `df` and return the deduplicated frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to deduplicate. When `columns` is None it is modified in
        place (matching the original behavior).
    columns : list of column labels, optional
        If given, rows are considered duplicates when they match on this
        subset of columns only; otherwise full-row duplicates are dropped.

    Returns
    -------
    pandas.DataFrame
        The frame with duplicate rows removed (first occurrence kept).
    """
    if columns is None:  # `is None`, not `== None` (PEP 8 identity check)
        df.drop_duplicates(inplace=True)
    else:
        # BUG FIX: the original called drop_duplicates(subset=columns,
        # inplace=False) and discarded the result, so subset-based
        # deduplication silently never happened. Capture the result.
        df = df.drop_duplicates(subset=columns)
    return df
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Function to read data based on file extension | |
# Dispatches to the matching pandas reader (CSV / JSON / Excel) by file
# extension. Requires `os` and `pd` to be imported at file level.
def read_data(file_path): | |
# Split off the extension (includes the leading dot, e.g. '.csv').
_ , file_ext = os.path.splitext(file_path) | |
if file_ext == '.csv': | |
return pd.read_csv(file_path) | |
elif file_ext == '.json': | |
return pd.read_json(file_path) | |
elif file_ext in ['.xls', '.xlsx']: | |
return pd.read_excel(file_path) | |
else: |
# NOTE(review): the else branch is truncated in this view — presumably it
# raises or reports an unsupported-extension error; confirm in the full file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import seaborn as sns
import matplotlib.pyplot as plt

# Select only numeric columns for correlation — corr() is only meaningful
# (and only defined) for numeric dtypes.
numeric_df = df.select_dtypes(include=['float64', 'int64'])
plt.figure(figsize=(8, 4))
# FIX: removed a duplicate `import seaborn as sns` that re-ran mid-script;
# the module is already imported at the top of this snippet.
# Annotated heatmap with correlations rendered as whole percentages.
sns.heatmap(numeric_df.corr(), annot=True, fmt=".0%")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Create a pair plot of some columns, colored by species, to show the
# pairwise relationships between the remaining features.
df_temp = df.drop(columns=["bill_length_mm", "body_mass_g"])
# BUG FIX: the original plotted `df`, leaving `df_temp` computed but unused,
# so the two dropped columns still appeared in the plot. Plot the reduced
# frame, as the drop() above clearly intends.
sns.pairplot(df_temp, hue='species')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
from sklearn.linear_model import LogisticRegression | |
# Prepare the data | |
# One-hot encode categorical columns as 0/1 ints, then sort rows by
# flipper length (ascending) so later line plots are monotone on x.
df_dummy = pd.get_dummies(df, dtype=int).sort_values(by='flipper_length_mm', ascending=True) | |
# Select the features and the target | |
# NOTE(review): iloc[:, 1:3] selects the columns at 0-based positions 1 and 2
# (i.e. the 2nd and 3rd columns), not the 3rd and 4th as the original inline
# comment claimed — verify which columns were actually intended. Positional
# selection is also fragile if df_dummy's column order changes.
X = df_dummy.iloc[:, 1:3] # Assuming that the features are in the 3rd and 4th columns |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
from sklearn.linear_model import LogisticRegression | |
from sklearn.model_selection import train_test_split | |
import matplotlib.pyplot as plt | |
# One-hot encode the categorical data and sort by flipper_length_mm | |
# Same preparation as the previous snippet: dummy-encode categoricals as
# 0/1 ints and sort rows by flipper length ascending.
df_dummy = pd.get_dummies(df, dtype=int).sort_values(by='flipper_length_mm', ascending=True) | |
# Select the features and the target variable |
# NOTE(review): snippet is truncated here — the actual feature/target
# selection and train/test split are not visible in this view.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.linear_model import LinearRegression | |
# NOTE(review): `random` appears unused in the lines visible here.
import random | |
# create linear regression object | |
lr = LinearRegression() | |
# fit linear regression | |
# NOTE(review): features come from `df_multiple` but the target comes from
# `df` — verify the two frames are row-aligned, or whether
# df_multiple['Weight'] was intended.
lr.fit(df_multiple[['Height','Gender']], df['Weight']) | |
# 100 evenly spaced height values for plotting the fitted line
# (np is assumed imported at file level; truncated view below this line).
x_values = np.linspace(55,80,100) |
NewerOlder