@ajosh0504
Last active September 18, 2023 08:55
Code snippets associated with the <BLOG_NAME> blog.
from elasticsearch import Elasticsearch
from pathlib import Path
from eland.ml.pytorch import PyTorchModel
from eland.ml.pytorch.transformers import TransformerModel
# Load the custom model
tm = TransformerModel("model", "text_classification")
# Export the model to a TorchScript representation which Elasticsearch uses
tmp_path = "models"
Path(tmp_path).mkdir(parents=True, exist_ok=True)
model_path, config, vocab_path = tm.save(tmp_path)
# Import model into Elasticsearch
es = Elasticsearch("ES_CLUSTER_URL", timeout=300)
ptm = PyTorchModel(es, tm.elasticsearch_model_id())
# You can also give the model a custom model id like
# ptm = PyTorchModel(es, "roberta_model")
ptm.import_model(model_path=model_path, config_path=None, vocab_path=vocab_path, config=config)
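Once imported, the model has to be deployed before it can serve inference. A minimal sketch of that step, assuming an elasticsearch-py 8.x client (which exposes ml.start_trained_model_deployment and ml.infer_trained_model); the sample document text is a placeholder:
# Deploy the imported model onto the cluster's ML nodes
es.ml.start_trained_model_deployment(model_id=tm.elasticsearch_model_id())
# Verify the deployment with a single inference call
response = es.ml.infer_trained_model(
    model_id=tm.elasticsearch_model_id(),
    docs=[{"text_field": "Sample text to classify"}],
)
print(response["inference_results"])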
from datasets import load_dataset
from transformers import RobertaTokenizer
# Load local datasets
data_files = {"train": "data/train.csv", "test": "data/test.csv"}
data = load_dataset("csv", data_files=data_files)
# Initialize the tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
# Save the tokenizer for importing into Elastic later
tokenizer.save_pretrained("roberta_model")
# Tokenizer function
def tokenize_function(examples):
    return tokenizer(
        examples["concat"],
        truncation=True,
        max_length=512,
        padding=True,
    )
# Tokenize data in batches
tokenized_data = data.map(tokenize_function, batched=True)
# Get tokenized train and test data
tokenized_train_data = tokenized_data["train"]
tokenized_test_data = tokenized_data["test"]
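To sanity-check the preprocessing, it can help to inspect one tokenized example; a minimal sketch, assuming the concat text column used above:
# Peek at the first tokenized training example
sample = tokenized_train_data[0]
print(sample.keys())             # input_ids, attention_mask, plus the original CSV columns
print(len(sample["input_ids"]))  # at most 512, enforced by max_length
print(tokenizer.decode(sample["input_ids"][:20]))  # round-trip the first tokens back to text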
from datasets import load_metric
import numpy as np
from transformers import RobertaForSequenceClassification
from transformers import Trainer, TrainingArguments
# Initialize the model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
# Define training arguments
training_args = TrainingArguments(output_dir="roberta_model", evaluation_strategy="epoch")
# Define metrics to track
accuracy = load_metric("accuracy")
recall = load_metric("recall")
precision = load_metric("precision")
f1 = load_metric("f1")
# Metric evaluation function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    eval_accuracy = accuracy.compute(predictions=predictions, references=labels)
    eval_recall = recall.compute(predictions=predictions, references=labels)
    eval_precision = precision.compute(predictions=predictions, references=labels)
    eval_f1 = f1.compute(predictions=predictions, references=labels)
    # Trainer expects a single flat dict of metrics
    return {**eval_accuracy, **eval_recall, **eval_precision, **eval_f1}
# Create the Trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_test_data,
    compute_metrics=compute_metrics,
)
# Train the model
trainer.train()
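After training, the fine-tuned weights and tokenizer need to land in the directory that the eland snippet above loads from ("model" in this example). A minimal sketch; the evaluate() call is optional but confirms the final metrics on the test split:
# Evaluate on the test split one last time
print(trainer.evaluate())
# Save the fine-tuned model and tokenizer to the directory
# loaded by TransformerModel("model", "text_classification") above
trainer.save_model("model")
tokenizer.save_pretrained("model")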