import pandas as pd
# Recommended TensorFlow version is <= 2.1.0; otherwise the F1 score callback breaks
import tensorflow as tf
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import tensorflow_datasets as tfds
from transformers import TFRobertaForSequenceClassification
from transformers import RobertaTokenizer
# Load your Dataset
train_tweets = pd.read_csv('train_tweets.csv')
test_tweets = pd.read_csv('test_tweets.csv')
test_tweets['label'] = 0
training_sentences, testing_sentences = train_test_split(train_tweets[['tweet', 'label']],
                                                         test_size=0.2)
roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
# RoBERTa accepts sequences of up to 512 tokens
max_length = 512
# The commonly recommended batch sizes for BERT-style models are 32 and 64; however,
# on this dataset we overfit quite fast, and smaller batches work like a regularization.
# You might play with adding another dropout layer instead (see the sketch below).
batch_size = 64
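# A minimal sketch of the dropout alternative mentioned above (an assumption,
# not part of the original run): transformers lets config values be overridden
# when loading the model, which would raise dropout inside RoBERTa, e.g.
# model = TFRobertaForSequenceClassification.from_pretrained(
#     "roberta-base", hidden_dropout_prob=0.2, attention_probs_dropout_prob=0.2)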
def convert_example_to_feature(review):
    # Single step that tokenizes the text (RoBERTa uses a byte-level BPE
    # vocabulary), adds the special tokens, and truncates reviews longer
    # than our max length
    return roberta_tokenizer.encode_plus(review,
                                         add_special_tokens=True,     # add <s> and </s>
                                         max_length=max_length,       # max text length RoBERTa accepts
                                         pad_to_max_length=True,      # pad to max_length with <pad> tokens
                                         return_attention_mask=True,  # mask so attention skips pad tokens
                                         )
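# Optional sanity check (illustrative, not part of the original gist): inspect
# what encode_plus returns for a short example.
# sample = convert_example_to_feature("this is a sample tweet")
# print(len(sample['input_ids']))      # 512: padded up to max_length
# print(sample['input_ids'][:5])       # token ids, starting with <s> (id 0)
# print(sample['attention_mask'][:5])  # 1 for real tokens, 0 for padding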
# Map to the input format expected by TFRobertaForSequenceClassification
def map_example_to_dict(input_ids, attention_masks, label):
    return {
        "input_ids": input_ids,
        "attention_mask": attention_masks,
    }, label
def encode_examples(ds, limit=-1):
    # Prepare input lists
    input_ids_list = []
    attention_mask_list = []
    label_list = []
    if limit > 0:
        ds = ds.take(limit)
    for review, label in tfds.as_numpy(ds):
        bert_input = convert_example_to_feature(review.decode())
        input_ids_list.append(bert_input['input_ids'])
        attention_mask_list.append(bert_input['attention_mask'])
        label_list.append([label])
    return tf.data.Dataset.from_tensor_slices((input_ids_list,
                                               attention_mask_list,
                                               label_list)).map(map_example_to_dict)
training_sentences_modified = tf.data.Dataset.from_tensor_slices((training_sentences['tweet'],
                                                                  training_sentences['label']))
testing_sentences_modified = tf.data.Dataset.from_tensor_slices((testing_sentences['tweet'],
                                                                 testing_sentences['label']))
ds_train_encoded = encode_examples(training_sentences_modified).shuffle(10000).batch(batch_size)
ds_test_encoded = encode_examples(testing_sentences_modified).batch(batch_size)
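# Optional sanity check (illustrative): take one batch and confirm the shapes
# that will be fed to the model.
# for features, labels in ds_train_encoded.take(1):
#     print(features['input_ids'].shape)       # (batch_size, max_length)
#     print(features['attention_mask'].shape)  # (batch_size, max_length)
#     print(labels.shape)                      # (batch_size, 1)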
learning_rate = 6e-5
number_of_epochs = 6
class ModelMetrics(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        # predict returns raw logits; softmax converts them to class probabilities
        y_val_pred = tf.nn.softmax(self.model.predict(ds_test_encoded))
        y_pred_argmax = tf.math.argmax(y_val_pred, axis=1)
        testing_sentences['predicted'] = y_pred_argmax
        f1_s = f1_score(testing_sentences['label'], testing_sentences['predicted'])
        print('\n f1 score is :', f1_s)
metrics = ModelMetrics()
# model initialization
model = TFRobertaForSequenceClassification.from_pretrained("roberta-base")
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08)
# Labels are integer class ids rather than one-hot vectors, so we use sparse
# categorical cross-entropy and sparse categorical accuracy
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
model.fit(ds_train_encoded, epochs=number_of_epochs,
          validation_data=ds_test_encoded, callbacks=[metrics])
# Build the submission file
# submission_sentences_modified = tf.data.Dataset.from_tensor_slices((test_tweets['tweet'],
#                                                                     test_tweets['label']))
# ds_submission_encoded = encode_examples(submission_sentences_modified).batch(batch_size)
# submission_pre = tf.nn.softmax(model.predict(ds_submission_encoded))
# submission_pre_argmax = tf.math.argmax(submission_pre, axis=1)
# test_tweets['label'] = submission_pre_argmax
# test_tweets[['id', 'label']].to_csv('submission.csv', index=False, header=True)
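# Illustrative single-tweet inference sketch (an assumption, not part of the
# original gist): reuse the helper above to classify one new tweet.
# enc = convert_example_to_feature("some new tweet")
# logits = model({"input_ids": tf.constant([enc['input_ids']]),
#                 "attention_mask": tf.constant([enc['attention_mask']])})[0]
# prediction = tf.math.argmax(tf.nn.softmax(logits), axis=1).numpy()[0]
# print(prediction)  # 1 if the tweet is flagged as hate speech, 0 otherwise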
@poojapawar09

Hi Manmohan,

Could you please assist me? I am trying to install TensorFlow and was able to install version 1.12.0 through the command prompt on Windows 10, but unfortunately it is not recognized in PyCharm Community Edition 2020.1. I then tried to install it through File >> Settings >> Project Interpreter, but was unable to. I also tried a specific version; none of the above worked for me.

I am also new to big data and machine learning and would like to learn more about the approach you followed.
Thanks and regards,
Pooja

@manmohan24nov
Author

manmohan24nov commented Jun 18, 2020

Check the Python interpreter configured in PyCharm and the environment path in Windows; they should point to the same installation.
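One quick way to confirm which interpreter a script actually runs under (a generic Python check, nothing specific to this gist):

import sys
print(sys.executable)  # path of the interpreter running this script

If the path printed inside PyCharm differs from the one where pip installed TensorFlow, point PyCharm's Project Interpreter at that installation.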

I have explained the approach in a Medium article (https://medium.com/analytics-vidhya/roberta-model-to-detect-hate-comments-on-social-media-f2db4e70fc4b).

Let me know if more explanation is required.

@poojapawar09

Thank you so much for your response. I did succeed in the installation.
