@M0nteCarl0
Last active October 9, 2023 05:32
Yandex CatBoost summary cheatsheet
from catboost import CatBoostClassifier, Pool
# Create a CatBoostClassifier model
model = CatBoostClassifier()
# Load your data into a Pool object (X_train and y_train are assumed to be defined)
train_data = Pool(X_train, y_train)
# Train the model
model.fit(train_data)
# Make predictions on the test set
preds = model.predict(X_test)
# Evaluate several metrics on the training pool
# (eval_metrics returns a list of per-iteration values for each metric)
metrics = model.eval_metrics(train_data, ['Accuracy', 'Precision', 'Recall', 'F1', 'Logloss'])
# Print the final value of each metric
for metric_name, metric_values in metrics.items():
    print(f"{metric_name}: {metric_values[-1]}")
from catboost import CatBoostClassifier
from dask_ml.model_selection import train_test_split
from dask.distributed import Client, wait
import dask.dataframe as dd
# Start a Dask cluster with multiple workers
client = Client(n_workers=4)
# Read the data into a Dask DataFrame
data = dd.read_csv('data.csv')
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('target', axis=1),
    data['target'],
    test_size=0.2,
    random_state=42
)
# Convert the Dask DataFrame to Dask arrays
X_train = X_train.to_dask_array(lengths=True)
y_train = y_train.to_dask_array(lengths=True)
# Materialize the training data and scatter it so every worker holds a copy
# (CatBoost expects in-memory data, not lazy Dask arrays)
X_train_future = client.scatter(X_train.compute(), broadcast=True)
y_train_future = client.scatter(y_train.compute(), broadcast=True)
# Define a function to train a CatBoost model on a single worker
def train_model(X, y):
    model = CatBoostClassifier(iterations=100, learning_rate=0.1)
    model.fit(X, y)
    return model
# Train one model per worker (Client has no n_workers attribute, so count the
# workers via scheduler_info; pure=False keeps the identical calls as separate tasks)
n_workers = len(client.scheduler_info()['workers'])
futures = client.map(train_model, [X_train_future] * n_workers,
                     [y_train_future] * n_workers, pure=False)
wait(futures)
# Collect the trained models from the workers
models = client.gather(futures)
# Materialize the test data (CatBoost also predicts on in-memory data)
X_test_local = X_test.compute()
y_test_local = y_test.compute()
# Predict on the test data using each model
y_preds = [model.predict(X_test_local) for model in models]
# Compute the accuracy for each model (reshape in case predict returns a column vector)
accuracies = [(y_pred.reshape(-1) == y_test_local.to_numpy()).mean() for y_pred in y_preds]
# Print the accuracies
for i, accuracy in enumerate(accuracies):
    print(f"Accuracy of Model {i+1}: {accuracy}")
from catboost import CatBoostClassifier
# Assuming you have a trained CatBoostClassifier model
model = CatBoostClassifier()
model.load_model('model.cbm')
# Note: save_model has no split_count parameter; CatBoost always writes a model
# as a single file, so splitting one model across several CBM files is not supported.
model.save_model('model_copy.cbm', format='cbm')
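
save_model does, however, support several export formats besides CBM. A short sketch (file names are placeholders; ONNX export is only available for models that format supports):

model.save_model('model.json', format='json')    # human-readable JSON dump
model.save_model('model.onnx', format='onnx')    # ONNX export
model.save_model('model.py', format='python')    # standalone Python apply code
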
from catboost import sum_models
# Rest of the code...
# Combine the trained models into a single model and save it as one CBM file
# (CatBoostClassifier has no combine_models method; use catboost.sum_models instead)
combined_model = sum_models(models)
combined_model.save_model('combined_model.cbm', format='cbm')
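
sum_models also accepts per-model weights; with equal weights of 1/n the result averages the ensembles' raw predictions instead of summing them:

n = len(models)
averaged_model = sum_models(models, weights=[1.0 / n] * n)
averaged_model.save_model('averaged_model.cbm', format='cbm')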