This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import mlfoundry as mlf | |
import random | |
import os | |
from simpletransformers.classification import ClassificationModel | |
# Function to load a Simple Transformers model & predict sentiment | |
def predict_model(model_params, input_headline): | |
try: | |
class_name_map = { | |
0: "negative", |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Training hyperparameters for this run.
training_args = {
    'train_batch_size': 8,
    'gradient_accumulation_steps': 16,
    'learning_rate': 2e-5,
    'num_train_epochs': 5,
    # based on the histogram of number of words present in the headline
    'max_seq_length': 60,
}
# model_type and model_name have been changed to now fine-tune RoBERTa instead of BERT | |
model_params = { |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Training hyperparameters for this run.
training_args = {
    'train_batch_size': 8,
    'gradient_accumulation_steps': 16,
    'learning_rate': 2e-5,
    # NOTE(review): the original inline comment here was cut off ("Change in");
    # presumably it flagged the epoch count as the value that changed - confirm.
    'num_train_epochs': 5,
    # based on the histogram of number of words present in the headline
    'max_seq_length': 60,
}
model_params = { | |
'model_type': 'bert', |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Training hyperparameters for this run.
training_args = {
    'train_batch_size': 8,
    'gradient_accumulation_steps': 16,
    'learning_rate': 2e-5,
    'num_train_epochs': 3,
    # based on the histogram of number of words present in the headline
    'max_seq_length': 60,
}
model_params = { | |
'model_type': 'bert', |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import mlfoundry as mlf | |
# Get an MLFoundry client, then create a run named 'bert_3epochs' under the
# 'financial-sentiment-analysis' project.
mlf_api = mlf.get_client()
mlf_run = mlf_api.create_run(
    project_name='financial-sentiment-analysis',
    run_name='bert_3epochs',
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def trainModel(model_params, training_args, run, run_name): | |
# Log the training & evaluation datasets as CSV files | |
run.log_dataset(train_df, data_slice=mlf.DataSlice.TRAIN, fileformat=mlf.FileFormat.CSV) | |
run.log_dataset(eval_df, data_slice=mlf.DataSlice.TEST, fileformat=mlf.FileFormat.CSV) | |
# Log the model specifications and the training hyperparameters as parameters for the run | |
run.log_params({**model_params, **training_args}) | |
training_args['output_dir'] = os.path.join('outputs', run_name) | |
training_args['overwrite_output_dir'] = True |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def trainModel(model_params, training_args, run, run_name): | |
training_args['output_dir'] = os.path.join('outputs', run_name) | |
training_args['overwrite_output_dir'] = True | |
model = ClassificationModel( | |
model_params['model_type'], | |
model_params['model_name'], | |
num_labels=3, | |
args=training_args | |
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load the dataset; header=None with explicit names labels the two columns
# as "sentiment" and "headline".
df = pd.read_csv(
    "all-data.csv",
    header=None,
    names=["sentiment", "headline"],
    encoding='ISO-8859-1',
)

# Inspect the distribution of the number of words in the headlines
# to figure out the max number of tokens to be used by the tokenizer
num_words = df["headline"].apply(lambda x: len(x.split()))
plt.hist(num_words)
# Create labels from the sentiment values | |
labels = { | |
"negative": 0, |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import pandas as pd | |
import numpy as np | |
import matplotlib.pyplot as plt | |
%matplotlib inline | |
from sklearn.model_selection import train_test_split | |
from sklearn.metrics import f1_score, accuracy_score | |
import torch |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def loadAnswerSpace() -> List[str]:
    """Read the answer space from ``dataset/answer_space.txt``.

    The path is relative to the current working directory.

    Returns:
        One string per line of the file, in file order, with the line
        terminators removed.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    # Explicit encoding so the result does not depend on the platform's
    # locale default.
    with open(os.path.join("dataset", "answer_space.txt"), encoding="utf-8") as f:
        answer_space = f.read().splitlines()
    return answer_space
def tokenizeQuestion(text_encoder, question, device) -> Dict: | |
tokenizer = transformers.AutoTokenizer.from_pretrained(text_encoder) | |
encoded_text = tokenizer( | |
text=[question], |