@tiandiao123
Created February 20, 2024 01:49
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from copy import deepcopy
from torch.utils.data import DataLoader, Dataset, TensorDataset
# Assuming the model is a simple neural network for regression/classification
class SimpleNN(nn.Module):
    def __init__(self, input_size, output_size):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(64, output_size)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x
def feature_importance(model, dataloader, criterion, device='cpu'):
    model.eval()
    original_loss = 0.0
    feature_scores = []

    # Calculate the original loss with unshuffled data
    for inputs, targets in dataloader:
        inputs, targets = inputs.to(device), targets.to(device)
        with torch.no_grad():
            outputs = model(inputs)
            loss = criterion(outputs, targets)
        original_loss += loss.item()
    original_loss /= len(dataloader)

    # Iterate over all features (assumes a TensorDataset whose first tensor holds the inputs)
    for i in range(dataloader.dataset.tensors[0].size(1)):
        shuffled_dataloader = deepcopy(dataloader)
        # Permute the current feature column across all samples
        idx = torch.randperm(shuffled_dataloader.dataset.tensors[0][:, i].size(0))
        shuffled_dataloader.dataset.tensors[0][:, i] = shuffled_dataloader.dataset.tensors[0][idx, i]

        shuffled_loss = 0.0
        # Calculate loss with shuffled data
        for inputs, targets in shuffled_dataloader:
            inputs, targets = inputs.to(device), targets.to(device)
            with torch.no_grad():
                outputs = model(inputs)
                loss = criterion(outputs, targets)
            shuffled_loss += loss.item()
        shuffled_loss /= len(shuffled_dataloader)

        # The importance score is the increase in loss caused by shuffling this feature
        feature_scores.append(shuffled_loss - original_loss)

    return feature_scores
# Example usage:
# Assuming `X_train` and `y_train` are your features and labels as torch tensors
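# For illustration only: hypothetical synthetic tensors so the example runs end to end.
# Replace with your own data; the shapes (1000 samples, 10 features, 1 target) are arbitrary assumptions.
torch.manual_seed(0)
X_train = torch.randn(1000, 10)
y_train = X_train @ torch.randn(10, 1) + 0.1 * torch.randn(1000, 1)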
dataset = TensorDataset(X_train, y_train)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
# Define your model, criterion, and optimizer
model = SimpleNN(input_size=X_train.size(1), output_size=y_train.size(1))
criterion = nn.MSELoss() # or any other suitable loss function
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# Train the model here before computing importances (a minimal sketch follows below)
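# A minimal training-loop sketch, under these assumptions: MSE regression, a small fixed
# number of epochs, and training on CPU before the model is moved to `device` below.
# Adjust epochs, loss, and device handling to your task.
num_epochs = 10
model.train()
for epoch in range(num_epochs):
    epoch_loss = 0.0
    for inputs, targets in dataloader:
        optimizer.zero_grad()               # reset gradients from the previous step
        outputs = model(inputs)             # forward pass
        loss = criterion(outputs, targets)  # compute training loss
        loss.backward()                     # backpropagate
        optimizer.step()                    # update weights
        epoch_loss += loss.item()
    print(f"Epoch {epoch + 1}: mean batch loss = {epoch_loss / len(dataloader):.4f}")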
# Compute feature importance
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
feature_scores = feature_importance(model, dataloader, criterion, device)
# Convert feature scores to pandas Series for better visualization
feature_importances = pd.Series(feature_scores, index=[f"Feature {i}" for i in range(X_train.size(1))])
print("Feature importances:\n", feature_importances.sort_values(ascending=False))