@thistleknot
Created November 2, 2023 03:18
Bayes regression using KDE
# This snippet was drafted in an offline environment and replicates a similar
# workflow; the read below assumes the token-scoped raw URL is still valid and
# that 'data_' loads as a pandas DataFrame of state-level indicators keyed by
# state name.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numpy.linalg import inv
from scipy import stats
from sklearn.neighbors import KernelDensity

data_ = pd.read_csv("https://raw.githubusercontent.com/thistleknot/Python-Stock/master/data/raw/states.csv?token=GHSAT0AAAAAACIYSECGQETPAPO6K4QYIFV6ZKDCJIQ").set_index('States')
# Fit per-column distributions: a parametric normal fit and a nonparametric KDE
for c in data_.columns:
    print(c)
    data = data_[[c]]

    # Step 1: Estimate the distribution
    distribution = stats.norm.fit(data[c])  # (loc, scale) of the fitted normal
    # Or, for a nonparametric estimate, a Gaussian KDE
    kde = KernelDensity(kernel='gaussian', bandwidth=0.5).fit(data[[c]])

    # Step 2: Validate the estimated distribution (visual and statistical)
    x = np.linspace(data[c].min(), data[c].max(), 1000)
    pdf = stats.norm.pdf(x, *distribution)
    # For the KDE, get the log-probability and convert to a pdf
    logprob = kde.score_samples(x[:, np.newaxis])
    kde_pdf = np.exp(logprob)
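    # A statistical check to pair with the visual one below: a
    # Kolmogorov-Smirnov test of the column against the fitted normal
    # (added here as a sketch; the original code only plots)
    ks_stat, ks_p = stats.kstest(data[c], 'norm', args=distribution)
    print(f"{c}: KS statistic = {ks_stat:.3f}, p-value = {ks_p:.3f}")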
    # Plot for visual inspection
    plt.figure(figsize=(12, 7))
    plt.hist(data[c], bins=30, density=True, alpha=0.5, color='g', label='Data Histogram')
    plt.plot(x, pdf, label='Parametric fit', color='blue')
    plt.plot(x, kde_pdf, label='KDE fit', color='red')
    plt.legend()
    plt.show()
    # Step 3: Use the distribution in a model, e.g. in a simulation
    simulated_data = stats.norm.rvs(*distribution, size=1000)
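# The fixed bandwidth=0.5 above will over- or under-smooth columns measured on
# different scales. A minimal, optional sketch of cross-validated bandwidth
# selection per column ('select_bandwidth' is a hypothetical helper, not part
# of the original gist):
from sklearn.model_selection import GridSearchCV

def select_bandwidth(values, bandwidths=np.logspace(-1, 1, 20)):
    # 5-fold CV over candidate bandwidths, scored by held-out log-likelihood
    grid = GridSearchCV(KernelDensity(kernel='gaussian'),
                        {'bandwidth': bandwidths}, cv=5)
    grid.fit(values)
    return grid.best_params_['bandwidth']

# Example: select_bandwidth(data_[['Poverty']].values)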
# Specify the target and predictors
target = 'Poverty' # Replace with actual target column name
predictors = [col for col in data_.columns if col != target]
# Using KDE to estimate the distribution and generate new samples
kde_models = {}
kde_samples = {}
for c in data_.columns:
    # Fit KDE to the data
    kde = KernelDensity(kernel='gaussian', bandwidth=0.5).fit(data_[[c]].values)
    kde_models[c] = kde
    # Generate new samples from the KDE
    new_samples = kde.sample(1000)  # Adjust the number of samples as needed
    kde_samples[c] = new_samples.flatten()
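# Note: sampling each column's KDE independently discards cross-column
# correlations. If joint draws are needed, one option (an assumption, not part
# of the original workflow) is a single multivariate KDE over all columns:
joint_kde = KernelDensity(kernel='gaussian', bandwidth=0.5).fit(data_.values)
joint_samples = pd.DataFrame(joint_kde.sample(1000), columns=data_.columns)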
# Define the likelihood of the target given the predictors (per-row normal pdf)
def likelihood_function(data, coefficients):
    predicted = np.dot(data[predictors], coefficients)
    return stats.norm.pdf(data[target], loc=predicted, scale=1)
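# For use inside an optimizer or MCMC step, the summed log-likelihood is more
# numerically stable than the raw per-row densities (a small sketch, with the
# same unit-noise-scale assumption as likelihood_function):
def log_likelihood(data, coefficients):
    predicted = np.dot(data[predictors], coefficients)
    return stats.norm.logpdf(data[target], loc=predicted, scale=1).sum()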
# Find the MAP estimate for the coefficients
def map_estimate(data, kde_samples):
    # Build the normal equations A @ beta = b from the observed data
    A = np.zeros((len(predictors), len(predictors)))
    b = np.zeros(len(predictors))
    for i, pred_i in enumerate(predictors):
        for j, pred_j in enumerate(predictors):
            A[i, j] = (data[pred_i] * data[pred_j]).sum()
        b[i] = (data[pred_i] * data[target]).sum()
    # NOTE: kde_samples is not used yet; this closed-form solve is the ordinary
    # least-squares solution, which equals the MAP estimate only under a flat
    # prior. Integrating the KDE samples would require a Bayesian updating
    # method such as MCMC (see the sketch below).
    coefficients = inv(A).dot(b)
    return coefficients
# Compute the (flat-prior) MAP estimate for the coefficients, with 'Poverty'
# as the target variable (set 'target' above to use a different column)
coefficients_map = map_estimate(data_, kde_samples)
# Display the estimated coefficients
print(coefficients_map)
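# The placeholder comments in map_estimate leave the Bayesian update
# unspecified. A minimal random-walk Metropolis sketch over the coefficients,
# assuming a zero-mean Gaussian prior (the prior choice and step sizes are
# assumptions, not from the original gist); it reuses log_likelihood above:
def metropolis(data, n_steps=5000, step=0.01, prior_scale=10.0, seed=0):
    rng = np.random.default_rng(seed)
    beta = np.zeros(len(predictors))

    def log_post(b):
        # log-posterior = log-likelihood + log-prior (up to a constant)
        return (log_likelihood(data, b)
                + stats.norm.logpdf(b, scale=prior_scale).sum())

    current = log_post(beta)
    draws = []
    for _ in range(n_steps):
        proposal = beta + rng.normal(scale=step, size=beta.shape)
        cand = log_post(proposal)
        # Accept with probability min(1, posterior ratio)
        if np.log(rng.uniform()) < cand - current:
            beta, current = proposal, cand
        draws.append(beta.copy())
    return np.array(draws)

# Example usage: posterior mean of the coefficients
# posterior_draws = metropolis(data_)
# print(posterior_draws.mean(axis=0))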