ravindu9701/BERT Sentiment Analysis.ipynb Secret

## BERT Sentiment Analysis.ipynb
{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.7.6","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat_minor":4,"nbformat":4,"cells":[
{"cell_type":"code","source":"import numpy as np\nimport pandas as pd\nimport tensorflow as tf\nimport sklearn\nfrom tqdm import tqdm\n\n# Load the review data set and preview one random row\ndf = pd.read_csv(\"IMDB Dataset.csv\")\ndf.sample()","metadata":{"_uuid":"d629ff2d2480ee46fbb7e2d37f6b5fab8052498a","_cell_guid":"79c7e3d0-c299-4dcb-8224-4455121ee9b0","execution":{"iopub.status.busy":"2021-11-24T09:18:26.473813Z","iopub.execute_input":"2021-11-24T09:18:26.474200Z","iopub.status.idle":"2021-11-24T09:18:32.988846Z","shell.execute_reply.started":"2021-11-24T09:18:26.474162Z","shell.execute_reply":"2021-11-24T09:18:32.988066Z"},"trusted":true},"execution_count":6,"outputs":[]},
{"cell_type":"code","source":"# %pip (rather than !pip) installs into the environment of the running kernel\n%pip install transformers","metadata":{"execution":{"iopub.status.busy":"2021-11-24T09:18:32.991040Z","iopub.execute_input":"2021-11-24T09:18:32.991310Z","iopub.status.idle":"2021-11-24T09:18:40.637200Z","shell.execute_reply.started":"2021-11-24T09:18:32.991286Z","shell.execute_reply":"2021-11-24T09:18:40.636246Z"},"trusted":true},"execution_count":7,"outputs":[]},
{"cell_type":"code","source":"# Loading the BERT Classifier and Tokenizer along with Input module\nfrom transformers import BertTokenizer, TFBertForSequenceClassification\nfrom transformers import InputExample, InputFeatures\n\nmodel = TFBertForSequenceClassification.from_pretrained(\"bert-base-uncased\")\ntokenizer = 
BertTokenizer.from_pretrained(\"bert-base-uncased\")","metadata":{"execution":{"iopub.status.busy":"2021-11-24T09:18:40.639001Z","iopub.execute_input":"2021-11-24T09:18:40.639369Z","iopub.status.idle":"2021-11-24T09:19:16.203447Z","shell.execute_reply.started":"2021-11-24T09:18:40.639322Z","shell.execute_reply":"2021-11-24T09:19:16.202344Z"},"trusted":true},"execution_count":8,"outputs":[]},
{"cell_type":"code","source":"model.summary()","metadata":{"execution":{"iopub.status.busy":"2021-11-24T09:19:16.207391Z","iopub.execute_input":"2021-11-24T09:19:16.207889Z","iopub.status.idle":"2021-11-24T09:19:16.242289Z","shell.execute_reply.started":"2021-11-24T09:19:16.207841Z","shell.execute_reply":"2021-11-24T09:19:16.241468Z"},"trusted":true},"execution_count":9,"outputs":[]},
{"cell_type":"code","source":"# changing positive and negative into numeric values\n\ndef cat2num(value):\n    \"\"\"Map a sentiment value to 1 (positive) or 0 (anything else).\n\n    A label that is already 1 stays 1, so running this cell a second time\n    does not collapse every label to 0.\n    \"\"\"\n    if value in ('positive', 1):\n        return 1\n    return 0\n\ndf['sentiment'] = df['sentiment'].apply(cat2num)\n# first 45000 rows are used for training, the remaining rows for validation\ntrain = df[:45000]\ntest = df[45000:]","metadata":{"execution":{"iopub.status.busy":"2021-11-24T09:19:16.245392Z","iopub.execute_input":"2021-11-24T09:19:16.245802Z","iopub.status.idle":"2021-11-24T09:19:16.304891Z","shell.execute_reply.started":"2021-11-24T09:19:16.245700Z","shell.execute_reply":"2021-11-24T09:19:16.304008Z"},"trusted":true},"execution_count":10,"outputs":[]},
{"cell_type":"code","source":"example = 'This is a blog post on how to do sentiment analysis with BERT'\ntokens=tokenizer.tokenize(example)\ntoken_ids = 
tokenizer.convert_tokens_to_ids(tokens)\nprint(tokens)\nprint(token_ids)","metadata":{"execution":{"iopub.status.busy":"2021-11-24T09:19:16.306643Z","iopub.execute_input":"2021-11-24T09:19:16.307041Z","iopub.status.idle":"2021-11-24T09:19:16.314362Z","shell.execute_reply.started":"2021-11-24T09:19:16.307002Z","shell.execute_reply":"2021-11-24T09:19:16.313174Z"},"trusted":true},"execution_count":11,"outputs":[]},
{"cell_type":"code","source":"def convert2inputexamples(train, test, review, sentiment):\n    \"\"\"Wrap every row of the train and test frames in an InputExample.\n\n    Args:\n        train, test: frames whose rows are converted one by one.\n        review: name of the column passed as `text_a`.\n        sentiment: name of the column passed as `label`.\n\n    Returns:\n        (trainexamples, validexamples), the row-wise `apply` results.\n    \"\"\"\n    def to_example(row):\n        # guid is a bookkeeping ID that is unused in this case\n        return InputExample(guid=None, text_a=row[review], label=row[sentiment])\n\n    trainexamples = train.apply(to_example, axis=1)\n    validexamples = test.apply(to_example, axis=1)\n    return trainexamples, validexamples\n\ntrainexamples, validexamples = convert2inputexamples(train, test, 'review', 'sentiment')","metadata":{"execution":{"iopub.status.busy":"2021-11-24T09:19:16.316436Z","iopub.execute_input":"2021-11-24T09:19:16.317082Z","iopub.status.idle":"2021-11-24T09:19:17.447058Z","shell.execute_reply.started":"2021-11-24T09:19:16.317041Z","shell.execute_reply":"2021-11-24T09:19:17.445822Z"},"trusted":true},"execution_count":12,"outputs":[]},
{"cell_type":"code","source":"trainexamples[0]","metadata":{"execution":{"iopub.status.busy":"2021-11-24T09:19:17.448565Z","iopub.execute_input":"2021-11-24T09:19:17.449174Z","iopub.status.idle":"2021-11-24T09:19:17.461820Z","shell.execute_reply.started":"2021-11-24T09:19:17.449134Z","shell.execute_reply":"2021-11-24T09:19:17.461016Z"},"trusted":true},"execution_count":13,"outputs":[]},
{"cell_type":"code","source":"def convertexamples2tf(examples, tokenizer, max_length=128):\n    \"\"\"Tokenize examples and wrap the result in a tf.data.Dataset.\n\n    Args:\n        examples: iterable of items exposing `text_a` and `label`.\n        tokenizer: object whose `encode_plus` returns `input_ids`,\n            `token_type_ids` and `attention_mask`.\n        max_length: length every sequence is truncated or padded to.\n\n    Returns:\n        A tf.data.Dataset yielding (inputs dict, label) pairs.\n    \"\"\"\n    features = []\n\n    for example in tqdm(examples):\n        input_dict = tokenizer.encode_plus(\n            example.text_a,\n            add_special_tokens=True,    # Add 'CLS' and 'SEP'\n            max_length=max_length,    # truncates if len(s) > max_length\n            return_token_type_ids=True,\n            return_attention_mask=True,\n            padding='max_length',    # replaces the deprecated pad_to_max_length=True\n            truncation=True\n        )\n\n        features.append(InputFeatures(\n            input_ids=input_dict[\"input_ids\"],\n            attention_mask=input_dict['attention_mask'],\n            token_type_ids=input_dict[\"token_type_ids\"],\n            label=example.label,\n        ))\n\n    def generate():\n        # Replays the pre-tokenized features in the structure declared below\n        for f in features:\n            yield (\n                {\n                    \"input_ids\": f.input_ids,\n                    \"attention_mask\": f.attention_mask,\n                    \"token_type_ids\": f.token_type_ids,\n                },\n                f.label,\n            )\n\n    return tf.data.Dataset.from_generator(\n        generate,\n        ({\"input_ids\": tf.int32, \"attention_mask\": tf.int32, \"token_type_ids\": tf.int32}, tf.int64),\n        (\n            {\n                \"input_ids\": tf.TensorShape([None]),\n                \"attention_mask\": tf.TensorShape([None]),\n                \"token_type_ids\": tf.TensorShape([None]),\n            },\n            tf.TensorShape([]),\n        ),\n    )\n\n\nDATA_COLUMN = 'review'\nLABEL_COLUMN = 'sentiment'","metadata":{"execution":{"iopub.status.busy":"2021-11-24T09:19:17.463428Z","iopub.execute_input":"2021-11-24T09:19:17.464039Z","iopub.status.idle":"2021-11-24T09:19:17.890967Z","shell.execute_reply.started":"2021-11-24T09:19:17.463998Z","shell.execute_reply":"2021-11-24T09:19:17.889772Z"},"trusted":true},"execution_count":14,"outputs":[]},
{"cell_type":"code","source":"train_data = convertexamples2tf(list(trainexamples), tokenizer)\ntrain_data = train_data.shuffle(100).batch(32).repeat(2)","metadata":{"execution":{"iopub.status.busy":"2021-11-24T09:19:17.895236Z","iopub.execute_input":"2021-11-24T09:19:17.897329Z","iopub.status.idle":"2021-11-24T09:25:22.844912Z","shell.execute_reply.started":"2021-11-24T09:19:17.896795Z","shell.execute_reply":"2021-11-24T09:25:22.844056Z"},"trusted":true},"execution_count":15,"outputs":[]},
{"cell_type":"code","source":"validation_data = convertexamples2tf(list(validexamples), tokenizer)\nvalidation_data = validation_data.batch(32)","metadata":{"execution":{"iopub.status.busy":"2021-11-24T09:25:22.846332Z","iopub.execute_input":"2021-11-24T09:25:22.846683Z","iopub.status.idle":"2021-11-24T09:26:02.478966Z","shell.execute_reply.started":"2021-11-24T09:25:22.846647Z","shell.execute_reply":"2021-11-24T09:26:02.477892Z"},"trusted":true},"execution_count":16,"outputs":[]},
{"cell_type":"code","source":"model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0), \n              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), \n              
metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])\n\nmodel.fit(train_data, epochs=2, validation_data=validation_data)","metadata":{"execution":{"iopub.status.busy":"2021-11-24T09:26:02.488308Z","iopub.execute_input":"2021-11-24T09:26:02.488735Z","iopub.status.idle":"2021-11-24T10:12:39.897840Z","shell.execute_reply.started":"2021-11-24T09:26:02.488698Z","shell.execute_reply":"2021-11-24T10:12:39.897102Z"},"trusted":true},"execution_count":18,"outputs":[]},
{"cell_type":"code","source":"sentences = ['This was a good movie. I would watch it again','I cannot believe I have wasted time on this movie, it is the worst movie I have ever seen']","metadata":{"execution":{"iopub.status.busy":"2021-11-24T10:15:54.649042Z","iopub.execute_input":"2021-11-24T10:15:54.649373Z","iopub.status.idle":"2021-11-24T10:15:54.653526Z","shell.execute_reply.started":"2021-11-24T10:15:54.649344Z","shell.execute_reply":"2021-11-24T10:15:54.652644Z"},"trusted":true},"execution_count":22,"outputs":[]},
{"cell_type":"code","source":"# Tokenize the raw sentences into TF tensors before sending them into the trained model\ntf_batch = tokenizer(sentences, max_length=128, padding=True, truncation=True, return_tensors='tf')\ntf_outputs = model(tf_batch)\n# softmax is taken over the last axis of the first model output\ntf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)\nlabels = ['Negative','Positive']\n# per sentence, the position of the largest score selects the name in `labels`\nlabel = tf.argmax(tf_predictions, axis=1).numpy()\nfor sentence, predicted in zip(sentences, label):\n    print(sentence, \": \", labels[predicted])","metadata":{"execution":{"iopub.status.busy":"2021-11-24T10:15:56.402504Z","iopub.execute_input":"2021-11-24T10:15:56.402829Z","iopub.status.idle":"2021-11-24T10:15:56.500207Z","shell.execute_reply.started":"2021-11-24T10:15:56.402799Z","shell.execute_reply":"2021-11-24T10:15:56.499454Z"},"trusted":true},"execution_count":23,"outputs":[]}]}