Josep Ferrer Sánchez rfeers

## data_cleaning_5.py
# Function to find outliers using IQR
def find_outliers_IQR(df):
    """Return the index labels of rows that hold an IQR outlier.

    A value counts as an outlier when it lies below ``Q1 - 1.5 * IQR`` or
    above ``Q3 + 1.5 * IQR`` of its own column. Only numeric columns are
    inspected; missing values are never flagged.

    Args:
        df: pandas DataFrame to scan.

    Returns:
        A list of index labels, in row order, one entry per row that has an
        outlier in at least one numeric column. Empty when there are no
        numeric columns or no outliers.
    """
    numeric_df = df.select_dtypes(include=['number'])
    outlier_mask = None
    for column in numeric_df.columns:
        values = numeric_df[column]
        Q1 = values.quantile(0.25)
        Q3 = values.quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        column_mask = (values < lower_bound) | (values > upper_bound)
        # OR the per-column masks so a row is reported once even when it is
        # an outlier in several columns.
        outlier_mask = column_mask if outlier_mask is None else (outlier_mask | column_mask)

    if outlier_mask is None:
        return []
    # Index positionally so duplicate index labels cannot break the lookup.
    outlier_indices = numeric_df.index[outlier_mask.to_numpy()].tolist()
    return outlier_indices

## data_cleaning_4.py
# Mapping of column name -> dtype name that each column is expected to have.
expected_types = {
    # identifiers and counts
    'recipe': 'int64',
    # nutritional measurements
    'calories': 'float64',
    'carbohydrate': 'float64',
    'sugar': 'float64',
    'protein': 'float64',
    # descriptive fields
    'category': 'str',
    'servings': 'int64',
    'high_traffic': 'bool',
}

## data_cleaning_3.py
def check_missing_data(df):
    """Report rows with missing values and drop them when they are rare.

    The share of rows containing at least one null is rounded to two
    decimals and expressed as a percentage. At 5% or less, those rows are
    removed from ``df`` in place. Above 5%, nothing is removed and the
    percentage of nulls per column is printed for further inspection.

    Args:
        df: pandas DataFrame to inspect; modified in place when rows are
            dropped.

    Returns:
        The same ``df`` object that was passed in.
    """
    null_row_count = df.isnull().any(axis=1).sum()
    total_rows = len(df)
    # An empty frame has no missing data; guard against the 0/0 division
    # that would otherwise yield NaN and fall into the "too many" branch.
    if total_rows:
        proportion_null_rows = 100 * round(null_row_count / total_rows, 2)
    else:
        proportion_null_rows = 0

    if proportion_null_rows <= 5:
        print(f"There are {null_row_count} rows with a null value. All of them are erased!")
        # dropna() returns a new frame by default; drop in place so the
        # caller's frame really loses the rows the message announces.
        df.dropna(inplace=True)
    else:
        print("Too many null values, we need to check columns by columns further.")
        if df.isnull().sum().sum() > 0:
            print("\nProportion of missing values by column")
            # Percentage of nulls per column, rounded like the row figure.
            print(100 * df.isnull().mean().round(2))
    return df

## data_cleaning_2.py
# Check if there are duplicates
def drop_duplicates(df, columns=None):
    """Remove duplicate rows from ``df`` in place and return it.

    Args:
        df: pandas DataFrame to de-duplicate; it is modified in place.
        columns: column label or list of labels used to decide whether two
            rows are duplicates. ``None`` (default) compares all columns.

    Returns:
        The same ``df`` object, with only the first occurrence of each
        duplicate kept.
    """
    # subset=None is pandas' own "use every column" default, so a single
    # in-place call covers both cases. The previous subset branch used
    # inplace=False and discarded the result, so nothing was removed.
    df.drop_duplicates(subset=columns, inplace=True)
    return df

## data_cleaning_1.py
# Function to read data based on file extension
def read_data(file_path):
    """Load a data file into a pandas DataFrame, choosing the reader by extension.

    The extension is matched case-insensitively.

    Args:
        file_path: path to a ``.csv``, ``.json``, ``.xls`` or ``.xlsx`` file.

    Returns:
        The DataFrame produced by the matching pandas reader.

    Raises:
        ValueError: if the file extension is not one of the supported ones.
    """
    _, file_ext = os.path.splitext(file_path)
    file_ext = file_ext.lower()
    if file_ext == '.csv':
        return pd.read_csv(file_path)
    if file_ext == '.json':
        return pd.read_json(file_path)
    if file_ext in ('.xls', '.xlsx'):
        return pd.read_excel(file_path)
    # Fail loudly instead of silently returning None for unknown formats.
    raise ValueError(f"Unsupported file format: {file_ext!r}")

## log_reg_seaborn_heatmap.py
import seaborn as sns
import matplotlib.pyplot as plt

# Correlation is computed on the float64/int64 columns only
numeric_df = df.select_dtypes(include=['float64', 'int64'])

plt.figure(figsize=(8, 4))

# Annotated heatmap of the pairwise correlations, shown as whole percentages
sns.heatmap(
    numeric_df.corr(),
    annot=True,
    fmt=".0%",
)

## log_reg_pair_plot.py
# Create a pair plot of a subset of the columns
df_temp = df.drop(columns=["bill_length_mm", "body_mass_g"])
# Plot the reduced frame: the original call passed the full `df`, which left
# df_temp unused and plotted the two dropped columns anyway.
sns.pairplot(df_temp, hue='species')  # the grid shows the relationship between each pair of columns

## Logistic_reg_Class_2.py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

# Prepare the data: one-hot encode as integers, then order rows by flipper length
df_dummy = pd.get_dummies(df, dtype=int)
df_dummy = df_dummy.sort_values(by='flipper_length_mm', ascending=True)

# Select the features: positions 1 and 2, i.e. the 2nd and 3rd columns
# NOTE(review): the previous comment said "3rd and 4th columns" -- confirm which columns are intended
X = df_dummy.iloc[:, 1:3]

## Logistic_reg_Class_1.py
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# One-hot encode the categorical data as integer indicator columns ...
df_dummy = pd.get_dummies(df, dtype=int)
# ... then order the rows by ascending flipper_length_mm
df_dummy = df_dummy.sort_values(by='flipper_length_mm', ascending=True)

# Select the features and the target variable

## MLR_IV.py
from sklearn.linear_model import LinearRegression
import random

# Build the estimator
lr = LinearRegression()

# Fit weight against height and gender
lr.fit(
    df_multiple[['Height', 'Gender']],
    df['Weight'],
)

# 100 evenly spaced values from 55 to 80
x_values = np.linspace(55, 80, num=100)
	# Function to find outliers using IQR
	def find_outliers_IQR(df):
		"""Return the index labels of rows that hold an IQR outlier.

		A value counts as an outlier when it lies below ``Q1 - 1.5 * IQR`` or
		above ``Q3 + 1.5 * IQR`` of its own column. Only numeric columns are
		inspected; missing values are never flagged.

		Args:
			df: pandas DataFrame to scan.

		Returns:
			A list of index labels, in row order, one entry per row that has
			an outlier in at least one numeric column. Empty when there are
			no numeric columns or no outliers.
		"""
		numeric_df = df.select_dtypes(include=['number'])
		outlier_mask = None
		for column in numeric_df.columns:
			values = numeric_df[column]
			Q1 = values.quantile(0.25)
			Q3 = values.quantile(0.75)
			IQR = Q3 - Q1
			lower_bound = Q1 - 1.5 * IQR
			upper_bound = Q3 + 1.5 * IQR
			column_mask = (values < lower_bound) | (values > upper_bound)
			# OR the per-column masks so a row is reported once even when it
			# is an outlier in several columns.
			outlier_mask = column_mask if outlier_mask is None else (outlier_mask | column_mask)

		if outlier_mask is None:
			return []
		# Index positionally so duplicate index labels cannot break the lookup.
		outlier_indices = numeric_df.index[outlier_mask.to_numpy()].tolist()
		return outlier_indices
	# Mapping of column name -> dtype name that each column is expected to have.
	expected_types = {
		# identifiers and counts
		'recipe': 'int64',
		# nutritional measurements
		'calories': 'float64',
		'carbohydrate': 'float64',
		'sugar': 'float64',
		'protein': 'float64',
		# descriptive fields
		'category': 'str',
		'servings': 'int64',
		'high_traffic': 'bool',
	}
	def check_missing_data(df):
		"""Report rows with missing values and drop them when they are rare.

		The share of rows containing at least one null is rounded to two
		decimals and expressed as a percentage. At 5% or less, those rows are
		removed from ``df`` in place. Above 5%, nothing is removed and the
		percentage of nulls per column is printed for further inspection.

		Args:
			df: pandas DataFrame to inspect; modified in place when rows are
				dropped.

		Returns:
			The same ``df`` object that was passed in.
		"""
		null_row_count = df.isnull().any(axis=1).sum()
		total_rows = len(df)
		# An empty frame has no missing data; guard against the 0/0 division
		# that would otherwise yield NaN and fall into the "too many" branch.
		if total_rows:
			proportion_null_rows = 100 * round(null_row_count / total_rows, 2)
		else:
			proportion_null_rows = 0

		if proportion_null_rows <= 5:
			print(f"There are {null_row_count} rows with a null value. All of them are erased!")
			# dropna() returns a new frame by default; drop in place so the
			# caller's frame really loses the rows the message announces.
			df.dropna(inplace=True)
		else:
			print("Too many null values, we need to check columns by columns further.")
			if df.isnull().sum().sum() > 0:
				print("\nProportion of missing values by column")
				# Percentage of nulls per column, rounded like the row figure.
				print(100 * df.isnull().mean().round(2))
		return df
	# Check if there are duplicates
	def drop_duplicates(df, columns=None):
		"""Remove duplicate rows from ``df`` in place and return it.

		Args:
			df: pandas DataFrame to de-duplicate; it is modified in place.
			columns: column label or list of labels used to decide whether
				two rows are duplicates. ``None`` (default) compares all
				columns.

		Returns:
			The same ``df`` object, with only the first occurrence of each
			duplicate kept.
		"""
		# subset=None is pandas' own "use every column" default, so a single
		# in-place call covers both cases. The previous subset branch used
		# inplace=False and discarded the result, so nothing was removed.
		df.drop_duplicates(subset=columns, inplace=True)
		return df
	# Function to read data based on file extension
	def read_data(file_path):
		"""Load a data file into a pandas DataFrame, choosing the reader by extension.

		The extension is matched case-insensitively.

		Args:
			file_path: path to a ``.csv``, ``.json``, ``.xls`` or ``.xlsx`` file.

		Returns:
			The DataFrame produced by the matching pandas reader.

		Raises:
			ValueError: if the file extension is not one of the supported ones.
		"""
		_, file_ext = os.path.splitext(file_path)
		file_ext = file_ext.lower()
		if file_ext == '.csv':
			return pd.read_csv(file_path)
		if file_ext == '.json':
			return pd.read_json(file_path)
		if file_ext in ('.xls', '.xlsx'):
			return pd.read_excel(file_path)
		# Fail loudly instead of silently returning None for unknown formats.
		raise ValueError(f"Unsupported file format: {file_ext!r}")
	import seaborn as sns
	import matplotlib.pyplot as plt

	# Correlation is computed on the float64/int64 columns only
	numeric_df = df.select_dtypes(include=['float64', 'int64'])

	plt.figure(figsize=(8, 4))

	# Annotated heatmap of the pairwise correlations, shown as whole percentages
	sns.heatmap(
		numeric_df.corr(),
		annot=True,
		fmt=".0%",
	)
	# Create a pair plot of a subset of the columns
	df_temp = df.drop(columns=["bill_length_mm", "body_mass_g"])
	# Plot the reduced frame: the original call passed the full `df`, which left
	# df_temp unused and plotted the two dropped columns anyway.
	sns.pairplot(df_temp, hue='species')  # the grid shows the relationship between each pair of columns
	import numpy as np
	import pandas as pd
	import matplotlib.pyplot as plt
	from sklearn.linear_model import LogisticRegression

	# Prepare the data: one-hot encode as integers, then order rows by flipper length
	df_dummy = pd.get_dummies(df, dtype=int)
	df_dummy = df_dummy.sort_values(by='flipper_length_mm', ascending=True)

	# Select the features: positions 1 and 2, i.e. the 2nd and 3rd columns
	# NOTE(review): the previous comment said "3rd and 4th columns" -- confirm which columns are intended
	X = df_dummy.iloc[:, 1:3]
	from sklearn.linear_model import LinearRegression
	import random

	# Build the estimator
	lr = LinearRegression()

	# Fit weight against height and gender
	lr.fit(
		df_multiple[['Height', 'Gender']],
		df['Weight'],
	)

	# 100 evenly spaced values from 55 to 80
	x_values = np.linspace(55, 80, num=100)