Created
November 2, 2023 03:18
-
-
Save thistleknot/e4b4ad21a342e5bde237aab6a98bd24a to your computer and use it in GitHub Desktop.
Bayes regression using KDE
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# The original environment had no internet access; this replicates a similar
# workflow on the states.csv dataset pulled from GitHub.
# NOTE(review): the ?token=... query string is a temporary GitHub token and
# will expire — confirm the URL still resolves.
data_ = pd.read_csv(
    "https://raw.githubusercontent.com/thistleknot/Python-Stock/master/data/raw/states.csv?token=GHSAT0AAAAAACIYSECGQETPAPO6K4QYIFV6ZKDCJIQ"
).set_index('States')

# For each column: fit a distribution, inspect the fit visually, then sample.
# (Indentation was lost in the paste; the per-column loop is assumed to cover
# the whole fit/plot/simulate sequence — confirm against the original gist.)
for c in data_.columns:
    print(c)
    data = data_[[c]]

    # Step 1: estimate the column's distribution.
    distribution = stats.norm.fit(data[c])  # parametric fit: (loc, scale)
    # Nonparametric alternative (bandwidth 0.5 is hard-coded, not tuned).
    kde = KernelDensity(kernel='gaussian', bandwidth=0.5).fit(data[[c]])

    # Step 2: validate the estimated distribution (visual inspection).
    x = np.linspace(data[c].min(), data[c].max(), 1000)
    pdf = stats.norm.pdf(x, *distribution)
    # score_samples returns log-density; exponentiate to recover a pdf.
    logprob = kde.score_samples(x[:, np.newaxis])
    kde_pdf = np.exp(logprob)

    plt.figure(figsize=(12, 7))
    plt.hist(data[c], bins=30, density=True, alpha=0.5, color='g', label='Data Histogram')
    plt.plot(x, pdf, label='Parametric fit', color='blue')
    plt.plot(x, kde_pdf, label='KDE fit', color='red')
    plt.legend()
    plt.show()

    # Step 3: use the fitted distribution, e.g. to simulate new data.
    simulated_data = stats.norm.rvs(*distribution, size=1000)
# Specify the target and predictor columns.
target = 'Poverty'  # Replace with actual target column name
predictors = [col for col in data_.columns if col != target]

# Fit one KDE per column and draw fresh samples from each fitted density.
kde_models = {}
kde_samples = {}
for c in data_.columns:
    model = KernelDensity(kernel='gaussian', bandwidth=0.5).fit(data_[[c]].values)
    kde_models[c] = model
    # Draw new samples from the KDE (adjust the count as needed).
    drawn = model.sample(1000)
    kde_samples[c] = drawn.flatten()
# Define the likelihood of the observed target values given the predictors.
def likelihood_function(data, coefficients, predictor_cols=None, target_col=None, scale=1.0):
    """Return the per-observation Gaussian likelihood of the target.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the predictor columns and the target column.
    coefficients : array-like
        Regression coefficients, one per predictor column.
    predictor_cols : list of str, optional
        Predictor column names. Defaults to the module-level ``predictors``
        (preserves the original global-based behavior).
    target_col : str, optional
        Target column name. Defaults to the module-level ``target``.
    scale : float, optional
        Standard deviation of the Gaussian noise model. The original
        hard-coded 1; it is now a parameter with the same default.

    Returns
    -------
    numpy.ndarray
        N(target | X @ coefficients, scale) evaluated per row.
    """
    cols = predictor_cols if predictor_cols is not None else predictors
    tgt = target_col if target_col is not None else target
    predicted = np.dot(data[cols], coefficients)
    return stats.norm.pdf(data[tgt], loc=predicted, scale=scale)
# Find the MAP estimate for the coefficients.
def map_estimate(data, kde_samples, predictor_cols=None, target_col=None):
    """Solve the normal equations for the regression coefficients.

    NOTE(review): ``kde_samples`` is accepted but not yet used — the original
    left the prior integration as a placeholder (a Bayesian update such as
    MCMC was suggested). With no prior term, the "MAP" estimate reduces to
    ordinary least squares without an intercept.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the predictor columns and the target column.
    kde_samples : dict
        Per-column KDE samples; currently unused (see note above).
    predictor_cols : list of str, optional
        Predictor column names. Defaults to the module-level ``predictors``.
    target_col : str, optional
        Target column name. Defaults to the module-level ``target``.

    Returns
    -------
    numpy.ndarray
        Estimated coefficients, one per predictor column.
    """
    cols = predictor_cols if predictor_cols is not None else predictors
    tgt = target_col if target_col is not None else target
    X = data[cols].to_numpy(dtype=float)
    y = data[tgt].to_numpy(dtype=float)
    # X.T @ X / X.T @ y replace the original O(p^2) Python double loop (which
    # also redundantly re-assigned b[i] on every inner iteration).
    A = X.T @ X
    b = X.T @ y
    # np.linalg.solve is numerically preferable to inv(A).dot(b).
    coefficients = np.linalg.solve(A, b)
    return coefficients
# 'Poverty' (the module-level target) is assumed to be the response variable;
# replace it above if the actual target column differs.
# Compute and display the MAP estimate of the coefficients.
coefficients_map = map_estimate(data_, kde_samples)
print(coefficients_map)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment