# minesh1291/SampleWeights_Regression.py

## SampleWeights_Regression.py
import pandas as pd
import numpy as np

# Example: derive per-sample weights for a regression target so that samples
# from sparsely populated regions of the target distribution count more.

# Example DataFrame with random target values
df = pd.DataFrame({
    'label': np.random.normal(size=1000)  # 1000 samples from a standard normal distribution
})

# Range the raw weights are rescaled into: samples in the most populated bin
# get MIN_SAMPLE_WEIGHT, samples in the least populated bin get MAX_SAMPLE_WEIGHT.
MIN_SAMPLE_WEIGHT = 1
MAX_SAMPLE_WEIGHT = 4

# Step 1: Bin the target values to create a frequency distribution
df['label_bin'] = pd.cut(df['label'], bins=10)

# Step 2: Calculate the frequency of each bin
bin_counts = df['label_bin'].value_counts().sort_index()

# Step 3: Calculate the weights inversely proportional to the frequency of each bin.
# Bins without samples are skipped: no row maps to them and 1 / 0 would only
# introduce inf entries.
bin_weights = 1 / bin_counts[bin_counts > 0]

# Step 4: Assign weights to each sample based on its bin.
# Mapping a categorical column can yield another categorical column (when the
# mapped values are all distinct), which does not support arithmetic, so the
# result is cast to plain floats.
df['weights'] = df['label_bin'].map(bin_weights).astype(float)

# Step 5: Normalize weights to the desired range (optional)
min_weight = df['weights'].min()
max_weight = df['weights'].max()
weight_span = max_weight - min_weight

if weight_span > 0:
    df['normalized_weights'] = (
        MIN_SAMPLE_WEIGHT
        + (df['weights'] - min_weight) * (MAX_SAMPLE_WEIGHT - MIN_SAMPLE_WEIGHT) / weight_span
    )
else:
    # Every sample has the same raw weight; rescaling would divide by zero,
    # so all samples simply receive the lower bound.
    df['normalized_weights'] = float(MIN_SAMPLE_WEIGHT)

# Drop the bin column as it's no longer needed
df = df.drop(columns=['label_bin'])

print(df)

if __name__ == "__main__":
    # Plotting is optional and only happens when the file is run directly;
    # matplotlib is imported here so the weight computation above does not
    # depend on it.
    import matplotlib.pyplot as plt

    # Target distribution, overlaid with the weight assigned to each sample
    plt.hist(df['label'], bins=10)
    plt.plot(df['label'], df['normalized_weights'], 'o')
    plt.show()