Skip to content

Instantly share code, notes, and snippets.

View tezansahu's full-sized avatar
🎯
Focusing

Tezan Sahu tezansahu

🎯
Focusing
View GitHub Profile
import mlfoundry as mlf
import random
import os
from simpletransformers.classification import ClassificationModel
# Function to load a Simple Transformers model & predict sentiment
def predict_model(model_params, input_headline):
try:
class_name_map = {
0: "negative",
# Training hyperparameters for the fine-tuning run.
training_args = {
    "train_batch_size": 8,
    "gradient_accumulation_steps": 16,
    "learning_rate": 2e-5,
    "num_train_epochs": 5,
    # Cap chosen from the histogram of the number of words per headline.
    "max_seq_length": 60,
}
# model_type and model_name have been changed to now fine-tune RoBERTa instead of BERT
model_params = {
# Training hyperparameters for the fine-tuning run.
training_args = {
    "train_batch_size": 8,
    "gradient_accumulation_steps": 16,
    "learning_rate": 2e-5,
    # Changed for this run: number of training epochs.
    "num_train_epochs": 5,
    # Cap chosen from the histogram of the number of words per headline.
    "max_seq_length": 60,
}
model_params = {
'model_type': 'bert',
# Training hyperparameters for the fine-tuning run.
training_args = {
    "train_batch_size": 8,
    "gradient_accumulation_steps": 16,
    "learning_rate": 2e-5,
    "num_train_epochs": 3,
    # Cap chosen from the histogram of the number of words per headline.
    "max_seq_length": 60,
}
model_params = {
'model_type': 'bert',
import mlfoundry as mlf
# Obtain an MLFoundry client, then create the run used for this experiment.
mlf_api = mlf.get_client()
mlf_run = mlf_api.create_run(
    project_name="financial-sentiment-analysis",
    run_name="bert_3epochs",
)
def trainModel(model_params, training_args, run, run_name):
# Log the training & evaluation datasets as CSV files
run.log_dataset(train_df, data_slice=mlf.DataSlice.TRAIN, fileformat=mlf.FileFormat.CSV)
run.log_dataset(eval_df, data_slice=mlf.DataSlice.TEST, fileformat=mlf.FileFormat.CSV)
# Log the model specifications and the training hyperparameters as parameters for the run
run.log_params({**model_params, **training_args})
training_args['output_dir'] = os.path.join('outputs', run_name)
training_args['overwrite_output_dir'] = True
def trainModel(model_params, training_args, run, run_name):
training_args['output_dir'] = os.path.join('outputs', run_name)
training_args['overwrite_output_dir'] = True
model = ClassificationModel(
model_params['model_type'],
model_params['model_name'],
num_labels=3,
args=training_args
)
# Load the dataset; header=None means the CSV has no header row, so the
# column names are supplied here.
df = pd.read_csv(
    "all-data.csv",
    header=None,
    names=["sentiment", "headline"],
    encoding="ISO-8859-1",
)

# Plot the distribution of whitespace-separated word counts per headline;
# it guides the choice of the maximum number of tokens for the tokenizer.
num_words = df["headline"].apply(lambda headline: len(headline.split()))
plt.hist(num_words)
# Create labels from the sentiment values
labels = {
"negative": 0,
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
import torch
def loadAnswerSpace() -> List[str]:
    """Read the answer space file, one entry per line.

    Returns:
        The lines of ``dataset/answer_space.txt`` (resolved relative to the
        current working directory), in file order, without line terminators.

    Raises:
        OSError: If the file cannot be opened.
    """
    answer_space_path = os.path.join("dataset", "answer_space.txt")
    # Explicit encoding so the result does not depend on the platform's
    # locale-specific default.
    with open(answer_space_path, encoding="utf-8") as f:
        answer_space = f.read().splitlines()
    return answer_space
def tokenizeQuestion(text_encoder, question, device) -> Dict:
tokenizer = transformers.AutoTokenizer.from_pretrained(text_encoder)
encoded_text = tokenizer(
text=[question],