Skip to content

Instantly share code, notes, and snippets.

# Function to find outliers using IQR
def find_outliers_IQR(df):
    """Return the index labels of rows with an IQR outlier in any numeric column.

    A value is treated as an outlier when it lies more than 1.5 * IQR below
    the first quartile or above the third quartile of its own column.
    Non-numeric columns are ignored and missing values are never flagged.

    Args:
        df: pandas DataFrame to inspect. It is not modified.

    Returns:
        list: Row index labels, each listed once, in order of first detection
        (column by column). Empty when no value falls outside the fences.
    """
    outlier_indices = []
    numeric_df = df.select_dtypes(include=['number'])
    for column in numeric_df.columns:
        values = numeric_df[column]
        Q1 = values.quantile(0.25)
        Q3 = values.quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        # Comparisons against NaN are False, so missing values are skipped.
        is_outlier = (values < lower_bound) | (values > upper_bound)
        outlier_indices.extend(numeric_df.index[is_outlier.to_numpy()].tolist())
    # A row can be an outlier in several columns; report each label only once
    # while keeping the order in which the rows were first found.
    return list(dict.fromkeys(outlier_indices))
# Expected dtype name for each column, keyed by column name
expected_types = {
    'recipe': 'int64',
    'calories': 'float64',
    'carbohydrate': 'float64',
    'sugar': 'float64',
    'protein': 'float64',
    'category': 'str',
    'servings': 'int64',
    'high_traffic': 'bool',
}
def check_missing_data(df):
    """Report missing values in *df* and drop incomplete rows when they are rare.

    The share of rows containing at least one null value is computed, rounded
    to two decimals, and expressed as a percentage. At 5% or less, those rows
    are removed from *df* in place. Otherwise nothing is removed and the
    percentage of missing values per column is printed so the columns can be
    examined one by one.

    Args:
        df: pandas DataFrame to check. Modified in place when rows are dropped.

    Returns:
        The same DataFrame object that was passed in.
    """
    total_rows = len(df)
    null_rows = int(df.isnull().any(axis=1).sum())
    # An empty frame has no rows to divide by; treat it as having no nulls.
    proportion_null_rows = 100 * round(null_rows / total_rows, 2) if total_rows else 0
    if proportion_null_rows <= 5:
        print(f"There are {null_rows} rows with a null value. All of them are erased!")
        # dropna() returns a new frame by default; inplace=True is what makes
        # the removal announced above actually happen on the caller's frame.
        df.dropna(inplace=True)
    else:
        print("Too many null values, we need to check columns by columns further.")
    # Only reached with nulls left when the rows were kept above.
    if df.isnull().sum().sum() > 0:
        print("\nProportion of missing values by column")
        print((100 * df.isnull().mean()).round(2))
    return df
# Check if there are duplicates
def drop_duplicates(df, columns=None):
    """Remove duplicate rows from *df* in place and return it.

    Args:
        df: pandas DataFrame to de-duplicate. Modified in place.
        columns: Column label or list of labels used to identify duplicates.
            When None, all columns are compared.

    Returns:
        The same DataFrame object, with only the first occurrence of each
        duplicate kept.
    """
    # subset=None is pandas' own "compare every column" default, so a single
    # in-place call covers both cases. The earlier subset branch used
    # inplace=False and discarded the result, leaving the frame unchanged.
    df.drop_duplicates(subset=columns, inplace=True)
    return df
# Function to read data based on file extension
def read_data(file_path):
    """Load a data file into a DataFrame, choosing the reader by file extension.

    Supported extensions (matched case-insensitively): ``.csv``, ``.json``,
    ``.xls`` and ``.xlsx``.

    Args:
        file_path: Path of the file to read.

    Returns:
        The DataFrame produced by the matching pandas reader.

    Raises:
        ValueError: If the file extension is not one of the supported ones.
    """
    _, file_ext = os.path.splitext(file_path)
    # Normalise case so that e.g. "DATA.CSV" is handled like "data.csv".
    file_ext = file_ext.lower()
    if file_ext == '.csv':
        return pd.read_csv(file_path)
    if file_ext == '.json':
        return pd.read_json(file_path)
    if file_ext in ('.xls', '.xlsx'):
        return pd.read_excel(file_path)
    raise ValueError(f"Unsupported file extension: {file_ext!r}")
import seaborn as sns
import matplotlib.pyplot as plt
# Correlation heatmap, computed from the float64/int64 columns only
numeric_df = df.select_dtypes(include=['float64', 'int64'])
plt.figure(figsize=(8, 4))
# fmt=".0%" annotates each cell as a whole-number percentage
sns.heatmap(numeric_df.corr(), annot=True, fmt=".0%")

# Pair plot coloured by species, showing the relationship between each pair of columns
# NOTE(review): df_temp is built here but the pair plot below is drawn from
# the full df, not df_temp — confirm which one was intended.
df_temp = df.drop(columns=["bill_length_mm", "body_mass_g"])
sns.pairplot(df, hue='species')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
# Prepare the data: one-hot encode with integer indicator columns, then
# order the rows by ascending flipper length
df_dummy = pd.get_dummies(df, dtype=int)
df_dummy = df_dummy.sort_values(by='flipper_length_mm')
# Select the features and the target
# NOTE(review): the slice 1:3 picks column positions 1 and 2 (the 2nd and 3rd
# columns), while the earlier comment spoke of the 3rd and 4th — confirm
# which columns are meant to be the features.
X = df_dummy.iloc[:, 1:3]
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
# One-hot encode the categorical data (integer indicator columns), then
# sort the rows by ascending flipper_length_mm
df_dummy = pd.get_dummies(df, dtype=int)
df_dummy = df_dummy.sort_values(by='flipper_length_mm')
# Select the features and the target variable
from sklearn.linear_model import LinearRegression
import random
# Create the linear regression model and fit it on Height and Gender to predict Weight
# (fit() returns the estimator itself, so construction and fitting can be chained)
# NOTE(review): the features come from df_multiple but the target from df —
# confirm that both frames hold the same rows in the same order.
lr = LinearRegression().fit(df_multiple[['Height', 'Gender']], df['Weight'])
# 100 evenly spaced values from 55 to 80 inclusive
x_values = np.linspace(55, 80, num=100)