import pandas as pd
# Recommended tensorflow version is <= 2.1.0; otherwise the F1 score function breaks
import tensorflow as tf
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import tensorflow_datasets as tfds
from transformers import RobertaTokenizer, TFRobertaForSequenceClassification
# Load the dataset
train_tweets = pd.read_csv('train_tweets.csv')
test_tweets = pd.read_csv('test_tweets.csv')
# The test set is unlabeled; add a placeholder label column so it can go through the same pipeline
test_tweets['label'] = 0

training_sentences, testing_sentences = train_test_split(train_tweets[['tweet', 'label']],
                                                         test_size=0.2)
roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Maximum sequence length; can be up to 512 for BERT/RoBERTa
max_length = 512
# The recommended batch sizes for BERT are 32 or 64; however, this dataset overfits
# quite fast, and smaller batches work like a regularization.
# You might experiment with adding another dropout layer instead.
batch_size = 64
def convert_example_to_feature(review):
    # Combined step for tokenization and token-to-ID mapping (RoBERTa uses byte-level
    # BPE); also adds the special tokens and truncates reviews longer than our max length
    return roberta_tokenizer.encode_plus(review,
                                         add_special_tokens=True,    # add <s> and </s> (RoBERTa's [CLS]/[SEP] equivalents)
                                         max_length=max_length,      # max length of the text that can go to RoBERTa
                                         pad_to_max_length=True,     # pad to max_length (deprecated in newer transformers; use padding='max_length')
                                         return_attention_mask=True, # attention mask so the model does not attend to pad tokens
                                         )
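# Quick sanity check (an illustrative sketch; the sample string below is not part of
# the original gist): encode_plus should return a dict whose 'input_ids' and
# 'attention_mask' are both padded out to max_length.
# sample = convert_example_to_feature("i love this movie")
# print(len(sample['input_ids']), len(sample['attention_mask']))  # expect: 512 512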
# Map to the input format expected by TFRobertaForSequenceClassification
def map_example_to_dict(input_ids, attention_masks, label):
    return {
        "input_ids": input_ids,
        "attention_mask": attention_masks,
    }, label
def encode_examples(ds, limit=-1):
    # Prepare input lists
    input_ids_list = []
    attention_mask_list = []
    label_list = []
    if limit > 0:
        ds = ds.take(limit)
    for review, label in tfds.as_numpy(ds):
        bert_input = convert_example_to_feature(review.decode())
        input_ids_list.append(bert_input['input_ids'])
        attention_mask_list.append(bert_input['attention_mask'])
        label_list.append([label])
    return tf.data.Dataset.from_tensor_slices((input_ids_list,
                                               attention_mask_list,
                                               label_list)).map(map_example_to_dict)
training_sentences_modified = tf.data.Dataset.from_tensor_slices((training_sentences['tweet'],
                                                                  training_sentences['label']))
testing_sentences_modified = tf.data.Dataset.from_tensor_slices((testing_sentences['tweet'],
                                                                 testing_sentences['label']))
ds_train_encoded = encode_examples(training_sentences_modified).shuffle(10000).batch(batch_size)
ds_test_encoded = encode_examples(testing_sentences_modified).batch(batch_size)
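# Optional sanity check (illustrative, not part of the original gist): each batch
# should be a dict of (batch_size, max_length) tensors plus a (batch_size, 1) label tensor.
# for features, labels in ds_train_encoded.take(1):
#     print(features['input_ids'].shape, features['attention_mask'].shape, labels.shape)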
learning_rate = 6e-5
number_of_epochs = 6

# Keras callback that reports the F1 score on the held-out split after every epoch
class ModelMetrics(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        y_val_pred = tf.nn.softmax(self.model.predict(ds_test_encoded))
        y_pred_argmax = tf.math.argmax(y_val_pred, axis=1)
        testing_sentences['predicted'] = y_pred_argmax
        f1_s = f1_score(testing_sentences['label'], testing_sentences['predicted'])
        print('\n f1 score is :', f1_s)

metrics = ModelMetrics()
# Model initialization
model = TFRobertaForSequenceClassification.from_pretrained("roberta-base")
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08)
# We do not have one-hot vectors, so we can use sparse categorical cross-entropy and accuracy
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
model.fit(ds_train_encoded, epochs=number_of_epochs,
          validation_data=ds_test_encoded, callbacks=[metrics])
# Submission file
# submission_sentences_modified = tf.data.Dataset.from_tensor_slices((test_tweets['tweet'],
#                                                                     test_tweets['label']))
# ds_submission_encoded = encode_examples(submission_sentences_modified).batch(batch_size)
# submission_pre = tf.nn.softmax(model.predict(ds_submission_encoded))
# submission_pre_argmax = tf.math.argmax(submission_pre, axis=1)
# test_tweets['label'] = submission_pre_argmax
# test_tweets[['id', 'label']].to_csv('submission.csv', index=False, header=True)
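# Hedged sketch of scoring a single tweet after training (illustrative only; the
# input string and variable names below are not part of the original gist):
# single = roberta_tokenizer.encode_plus("sample tweet text",
#                                        add_special_tokens=True,
#                                        max_length=max_length,
#                                        pad_to_max_length=True,
#                                        return_attention_mask=True,
#                                        return_tensors='tf')
# logits = model(single['input_ids'], attention_mask=single['attention_mask'])[0]
# print(tf.math.argmax(tf.nn.softmax(logits, axis=1), axis=1).numpy())  # predicted class index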
Hi Manmohan,
Could you please assist me? I am trying to install TensorFlow. I was able to install version 1.12.0 through the command prompt in Windows 10, but unfortunately it is not recognized in PyCharm Community Edition 2020.1. I then tried to install it through File >> Settings >> Project Interpreter, but was unable to. I also tried a specific version; none of the above worked for me.
Additionally, I am new to big data and machine learning and would like to learn more about the approach you followed.
Thanks and regards,
Pooja
Check the Python interpreter configured in PyCharm and the environment path in Windows; they should be the same.
I have explained the approach in a Medium article (https://medium.com/analytics-vidhya/roberta-model-to-detect-hate-comments-on-social-media-f2db4e70fc4b).
Let me know if more explanation is required.
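For example, one quick way to compare them (a minimal sketch, not from the original thread): run the following two lines in both the Windows command prompt and in PyCharm's Python console. If PyCharm is using the interpreter that TensorFlow was installed into, the two printed paths should match.

import sys
print(sys.executable)  # path of the interpreter currently running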
Thank you so much for your response. I did succeed with the installation.