Tezan Sahu tezansahu

## mlf1_streamlit_roberta.py
import mlfoundry as mlf
import random
import os
from simpletransformers.classification import ClassificationModel

# Function to load a Simple Transformers model & predict sentiment
def predict_model(model_params, input_headline):
    try:
        class_name_map = {
            0: "negative",

## mlf1_run3.py
# Training hyperparameters for this run.
training_args = {
    "train_batch_size": 8,
    "gradient_accumulation_steps": 16,
    "learning_rate": 2e-5,
    "num_train_epochs": 5,
    # Chosen from the histogram of the number of words per headline.
    "max_seq_length": 60,
}

# model_type and model_name have been changed to now fine-tune RoBERTa instead of BERT
model_params = {

## mlf1_run2.py
# Training hyperparameters for the second run.
training_args = {
    "train_batch_size": 8,
    "gradient_accumulation_steps": 16,
    "learning_rate": 2e-5,
    # This run trains for 5 epochs; the mlf1_run1.py settings use 3.
    "num_train_epochs": 5,
    # Chosen from the histogram of the number of words per headline.
    "max_seq_length": 60,
}

model_params = {
    'model_type': 'bert',

## mlf1_run1.py
# Training hyperparameters for the first run.
training_args = {
    "train_batch_size": 8,
    "gradient_accumulation_steps": 16,
    "learning_rate": 2e-5,
    "num_train_epochs": 3,
    # Chosen from the histogram of the number of words per headline.
    "max_seq_length": 60,
}

model_params = {
    'model_type': 'bert',

## mlf1_mlfInit.py
import mlfoundry as mlf

# Get an MLFoundry client, then create the run used by this experiment
# inside the 'financial-sentiment-analysis' project.
mlf_api = mlf.get_client()
mlf_run = mlf_api.create_run(
    project_name="financial-sentiment-analysis",
    run_name="bert_3epochs",
)

## mlf1_trainModel_mlf.py
def trainModel(model_params, training_args, run, run_name):
    """Log the datasets and parameters to ``run``, then set the output options.

    Uses the module-level ``train_df`` and ``eval_df`` objects together with
    the ``mlf`` and ``os`` modules. As written here, the function stops after
    configuring the output directory.

    Args:
        model_params: Mapping of model settings; logged as run parameters.
        training_args: Mapping of training hyperparameters. It is logged and
            then modified in place with ``output_dir`` and
            ``overwrite_output_dir`` entries.
        run: Object providing ``log_dataset`` and ``log_params``.
        run_name: Name of the sub-directory created under ``outputs``.
    """
    # Record the training and evaluation datasets as CSV files.
    run.log_dataset(
        train_df,
        data_slice=mlf.DataSlice.TRAIN,
        fileformat=mlf.FileFormat.CSV,
    )
    run.log_dataset(
        eval_df,
        data_slice=mlf.DataSlice.TEST,
        fileformat=mlf.FileFormat.CSV,
    )

    # Merge both mappings before logging; on duplicate keys the value from
    # training_args wins. The output settings below are added afterwards, so
    # they are not part of what gets logged.
    logged_params = {**model_params, **training_args}
    run.log_params(logged_params)

    training_args.update(
        output_dir=os.path.join('outputs', run_name),
        overwrite_output_dir=True,
    )

## mlf1_trainModel_basic.py
def trainModel(model_params, training_args, run, run_name):
    """Set the output options and build a three-label classification model.

    Args:
        model_params: Mapping with ``'model_type'`` and ``'model_name'`` keys.
        training_args: Mapping of training hyperparameters; modified in place
            with ``output_dir`` and ``overwrite_output_dir`` entries and then
            passed to the model as ``args``.
        run: Not used by the statements shown here.
        run_name: Name of the sub-directory created under ``outputs``.
    """
    training_args.update(
        output_dir=os.path.join('outputs', run_name),
        overwrite_output_dir=True,
    )

    model_type = model_params['model_type']
    model_name = model_params['model_name']
    model = ClassificationModel(
        model_type,
        model_name,
        num_labels=3,
        args=training_args,
    )

## mlf1_datasetExploration.py
# Load the dataset; the file is read without a header row, so the two column
# names are supplied explicitly.
df = pd.read_csv(
    "all-data.csv",
    header=None,
    names=["sentiment", "headline"],
    encoding='ISO-8859-1',
)

# Plot the distribution of whitespace-separated word counts per headline.
# It is used to decide the maximum number of tokens for the tokenizer.
num_words = df["headline"].apply(lambda headline: len(headline.split()))
plt.hist(num_words)

# Create labels from the sentiment values
labels = {
    "negative": 0,

## mlf1_setup.py
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

import torch

## inference.py
def loadAnswerSpace() -> List[str]:
    """Return the lines of ``dataset/answer_space.txt`` as a list of strings.

    The path is relative to the current working directory, and the line
    terminators are not included in the returned strings.
    """
    answer_space_path = os.path.join("dataset", "answer_space.txt")
    with open(answer_space_path) as answer_file:
        return answer_file.read().splitlines()


  def tokenizeQuestion(text_encoder, question, device) -> Dict:
    tokenizer = transformers.AutoTokenizer.from_pretrained(text_encoder)
    encoded_text = tokenizer(
        text=[question],
	import mlfoundry as mlf
	import random
	import os
	from simpletransformers.classification import ClassificationModel

	# Function to load a Simple Transformers model & predict sentiment
	def predict_model(model_params, input_headline):
	try:
	class_name_map = {
	0: "negative",
	# Training hyperparameters for this run.
	training_args = {
	    "train_batch_size": 8,
	    "gradient_accumulation_steps": 16,
	    "learning_rate": 2e-5,
	    "num_train_epochs": 5,
	    # Chosen from the histogram of the number of words per headline.
	    "max_seq_length": 60,
	}

	# model_type and model_name have been changed to now fine-tune RoBERTa instead of BERT
	model_params = {
	import mlfoundry as mlf

	# Get an MLFoundry client, then create the run used by this experiment
	# inside the 'financial-sentiment-analysis' project.
	mlf_api = mlf.get_client()
	mlf_run = mlf_api.create_run(
	    project_name="financial-sentiment-analysis",
	    run_name="bert_3epochs",
	)
	def trainModel(model_params, training_args, run, run_name):
	    """Log the datasets and parameters to ``run``, then set the output options.

	    Uses the module-level ``train_df`` and ``eval_df`` objects together with
	    the ``mlf`` and ``os`` modules.

	    Args:
	        model_params: Mapping of model settings; logged as run parameters.
	        training_args: Mapping of training hyperparameters. It is logged and
	            then modified in place with ``output_dir`` and
	            ``overwrite_output_dir`` entries.
	        run: Object providing ``log_dataset`` and ``log_params``.
	        run_name: Name of the sub-directory created under ``outputs``.
	    """
	    # Log the training & evaluation datasets as CSV files
	    run.log_dataset(train_df, data_slice=mlf.DataSlice.TRAIN, fileformat=mlf.FileFormat.CSV)
	    run.log_dataset(eval_df, data_slice=mlf.DataSlice.TEST, fileformat=mlf.FileFormat.CSV)

	    # Log the model specifications and the training hyperparameters as
	    # parameters for the run. The two mappings are unpacked into a single
	    # dict; placing them in braces without ``**`` would build a set of
	    # dicts, which raises TypeError because dicts are unhashable.
	    run.log_params({**model_params, **training_args})

	    training_args['output_dir'] = os.path.join('outputs', run_name)
	    training_args['overwrite_output_dir'] = True
	# Load the dataset; the file is read without a header row, so the two column
	# names are supplied explicitly.
	df = pd.read_csv(
	    "all-data.csv",
	    header=None,
	    names=["sentiment", "headline"],
	    encoding='ISO-8859-1',
	)

	# Plot the distribution of whitespace-separated word counts per headline.
	# It is used to decide the maximum number of tokens for the tokenizer.
	num_words = df["headline"].apply(lambda headline: len(headline.split()))
	plt.hist(num_words)

	# Create labels from the sentiment values
	labels = {
	"negative": 0,
	import os
	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	%matplotlib inline

	from sklearn.model_selection import train_test_split
	from sklearn.metrics import f1_score, accuracy_score

	import torch
	def loadAnswerSpace() -> List[str]:
	    """Return the lines of ``dataset/answer_space.txt`` as a list of strings.

	    The path is relative to the current working directory, and the line
	    terminators are not included in the returned strings.
	    """
	    # The body is nested under the ``def`` and the ``with`` so the file is
	    # closed before the list is returned.
	    with open(os.path.join("dataset", "answer_space.txt")) as f:
	        answer_space = f.read().splitlines()
	    return answer_space


	def tokenizeQuestion(text_encoder, question, device) -> Dict:
	tokenizer = transformers.AutoTokenizer.from_pretrained(text_encoder)
	encoded_text = tokenizer(
	text=[question],