-
-
Save ravindu9701/1a5451fd79f633727ac1c636cb415892 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{
 "metadata": {
  "kernelspec": {"language": "python", "display_name": "Python 3", "name": "python3"},
  "language_info": {
   "name": "python",
   "version": "3.7.6",
   "mimetype": "text/x-python",
   "codemirror_mode": {"name": "ipython", "version": 3},
   "pygments_lexer": "ipython3",
   "nbconvert_exporter": "python",
   "file_extension": ".py"
  }
 },
 "nbformat_minor": 4,
 "nbformat": 4,
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Sentiment analysis of IMDB reviews with BERT\n",
    "\n",
    "Fine-tunes `bert-base-uncased` (TensorFlow, Hugging Face `transformers`) on `IMDB Dataset.csv` as a two-class classifier (0 = negative, 1 = positive), then labels a couple of hand-written reviews.\n",
    "\n",
    "The notebook is meant to be run top to bottom on a fresh kernel."
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "execution_count": null,
   "outputs": [],
   "source": [
    "# %pip (not !pip) installs into the environment of the running kernel.\n",
    "# Bounded to the 4.x line that this notebook's API usage (TFBert*, InputExample/InputFeatures)\n",
    "# targets; widen the range only after re-testing.\n",
    "%pip install -q \"transformers>=4.12,<5\""
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "execution_count": null,
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import sklearn\n",
    "import tensorflow as tf\n",
    "from tqdm import tqdm\n",
    "from transformers import BertTokenizer, TFBertForSequenceClassification\n",
    "from transformers import InputExample, InputFeatures"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "execution_count": null,
   "outputs": [],
   "source": [
    "# Configuration: everything a reader might want to tune.\n",
    "DATA_PATH = 'IMDB Dataset.csv'\n",
    "DATA_COLUMN = 'review'       # column holding the review text\n",
    "LABEL_COLUMN = 'sentiment'   # column holding 'positive' / 'negative'\n",
    "PRETRAINED_MODEL = 'bert-base-uncased'\n",
    "\n",
    "TRAIN_SIZE = 45000     # the first TRAIN_SIZE rows train the model, the remaining rows validate it\n",
    "MAX_LENGTH = 128       # tokens per review after padding / truncation\n",
    "BATCH_SIZE = 32\n",
    "SHUFFLE_BUFFER = 100   # size of the tf.data shuffle window\n",
    "PASSES_PER_EPOCH = 2   # Dataset.repeat() count: passes over the training set per Keras epoch\n",
    "EPOCHS = 2\n",
    "LEARNING_RATE = 3e-5\n",
    "\n",
    "SEED = 42\n",
    "np.random.seed(SEED)       # DataFrame.sample() without random_state draws from NumPy's global RNG\n",
    "tf.random.set_seed(SEED)   # seed TensorFlow before the model is built and the data is shuffled"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "execution_count": null,
   "outputs": [],
   "source": [
    "df = pd.read_csv(DATA_PATH)\n",
    "df.sample()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load the pre-trained model and tokenizer"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "execution_count": null,
   "outputs": [],
   "source": [
    "# BERT with a sequence-classification head, plus the tokenizer that matches the checkpoint\n",
    "model = TFBertForSequenceClassification.from_pretrained(PRETRAINED_MODEL)\n",
    "tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL)"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "execution_count": null,
   "outputs": [],
   "source": [
    "model.summary()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Encode labels and split into train / validation"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "execution_count": null,
   "outputs": [],
   "source": [
    "def cat2num(value):\n",
    "    \"\"\"Map the sentiment string 'positive' to 1 and any other value to 0.\"\"\"\n",
    "    return 1 if value == 'positive' else 0"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "execution_count": null,
   "outputs": [],
   "source": [
    "# cat2num sends every non-'positive' value to 0, so make sure nothing unexpected\n",
    "# (typos, missing values) would be silently counted as negative.\n",
    "unexpected = set(df[LABEL_COLUMN].unique()) - {'positive', 'negative'}\n",
    "assert not unexpected, f'unexpected sentiment values: {unexpected}'\n",
    "\n",
    "# Build a new frame instead of overwriting df: encoding in place is not idempotent,\n",
    "# because re-running the cell on already-numeric labels maps all of them to 0.\n",
    "reviews = df.assign(**{LABEL_COLUMN: df[LABEL_COLUMN].map(cat2num)})\n",
    "\n",
    "train = reviews[:TRAIN_SIZE]\n",
    "test = reviews[TRAIN_SIZE:]\n",
    "assert len(train) + len(test) == len(reviews)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## What the tokenizer produces"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "execution_count": null,
   "outputs": [],
   "source": [
    "example = 'This is a blog post on how to do sentiment analysis with BERT'\n",
    "tokens = tokenizer.tokenize(example)\n",
    "token_ids = tokenizer.convert_tokens_to_ids(tokens)\n",
    "print(tokens)\n",
    "print(token_ids)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Build the model inputs"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "execution_count": null,
   "outputs": [],
   "source": [
    "def convert2inputexamples(train, test, review, sentiment):\n",
    "    \"\"\"Wrap every row of two DataFrames in a transformers InputExample.\n",
    "\n",
    "    Args:\n",
    "        train: DataFrame with the training rows.\n",
    "        test: DataFrame with the validation rows.\n",
    "        review: name of the column that holds the text.\n",
    "        sentiment: name of the column that holds the label.\n",
    "\n",
    "    Returns:\n",
    "        (trainexamples, validexamples): the row-wise results of DataFrame.apply on\n",
    "        train and test, one InputExample per row, in row order.\n",
    "    \"\"\"\n",
    "    def to_example(row):\n",
    "        # guid is a bookkeeping id and is not used in this notebook\n",
    "        return InputExample(guid=None, text_a=row[review], label=row[sentiment])\n",
    "\n",
    "    return train.apply(to_example, axis=1), test.apply(to_example, axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "execution_count": null,
   "outputs": [],
   "source": [
    "trainexamples, validexamples = convert2inputexamples(train, test, DATA_COLUMN, LABEL_COLUMN)\n",
    "trainexamples.iloc[0]  # positional lookup, independent of the frame's index labels"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "execution_count": null,
   "outputs": [],
   "source": [
    "def convertexamples2tf(examples, tokenizer, max_length=128):\n",
    "    \"\"\"Tokenize InputExamples and expose them as a tf.data.Dataset.\n",
    "\n",
    "    Args:\n",
    "        examples: iterable of InputExample; text_a is tokenized, label is passed through.\n",
    "        tokenizer: Hugging Face tokenizer used to encode the text.\n",
    "        max_length: every sequence is padded or truncated to this many tokens.\n",
    "\n",
    "    Returns:\n",
    "        An unbatched tf.data.Dataset of (features, label) pairs. features is a dict with\n",
    "        int32 'input_ids', 'attention_mask' and 'token_type_ids'; label is an int64 scalar.\n",
    "    \"\"\"\n",
    "    features = []\n",
    "\n",
    "    for example in tqdm(examples):\n",
    "        encoded = tokenizer.encode_plus(\n",
    "            example.text_a,\n",
    "            add_special_tokens=True,     # add [CLS] and [SEP]\n",
    "            max_length=max_length,\n",
    "            padding='max_length',        # replaces the deprecated pad_to_max_length=True\n",
    "            truncation=True,             # cut texts longer than max_length\n",
    "            return_token_type_ids=True,\n",
    "            return_attention_mask=True,\n",
    "        )\n",
    "        features.append(\n",
    "            InputFeatures(\n",
    "                input_ids=encoded['input_ids'],\n",
    "                attention_mask=encoded['attention_mask'],\n",
    "                token_type_ids=encoded['token_type_ids'],\n",
    "                label=example.label,\n",
    "            )\n",
    "        )\n",
    "\n",
    "    def generate():\n",
    "        for feature in features:\n",
    "            yield (\n",
    "                {\n",
    "                    'input_ids': feature.input_ids,\n",
    "                    'attention_mask': feature.attention_mask,\n",
    "                    'token_type_ids': feature.token_type_ids,\n",
    "                },\n",
    "                feature.label,\n",
    "            )\n",
    "\n",
    "    # output_signature supersedes the deprecated output_types / output_shapes arguments\n",
    "    token_spec = tf.TensorSpec(shape=(None,), dtype=tf.int32)\n",
    "    return tf.data.Dataset.from_generator(\n",
    "        generate,\n",
    "        output_signature=(\n",
    "            {\n",
    "                'input_ids': token_spec,\n",
    "                'attention_mask': token_spec,\n",
    "                'token_type_ids': token_spec,\n",
    "            },\n",
    "            tf.TensorSpec(shape=(), dtype=tf.int64),\n",
    "        ),\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "execution_count": null,
   "outputs": [],
   "source": [
    "train_data = convertexamples2tf(list(trainexamples), tokenizer, max_length=MAX_LENGTH)\n",
    "# repeat() comes last, so one Keras epoch iterates over the shuffled, batched\n",
    "# training set PASSES_PER_EPOCH times.\n",
    "train_data = train_data.shuffle(SHUFFLE_BUFFER).batch(BATCH_SIZE).repeat(PASSES_PER_EPOCH)"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "execution_count": null,
   "outputs": [],
   "source": [
    "validation_data = convertexamples2tf(list(validexamples), tokenizer, max_length=MAX_LENGTH)\n",
    "validation_data = validation_data.batch(BATCH_SIZE)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Fine-tune\n",
    "\n",
    "With `PASSES_PER_EPOCH = 2` and `EPOCHS = 2` the model iterates over the training set four times in total; validation runs after each Keras epoch."
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "execution_count": null,
   "outputs": [],
   "source": [
    "model.compile(\n",
    "    optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE, epsilon=1e-08, clipnorm=1.0),\n",
    "    # from_logits=True: the loss applies the softmax itself\n",
    "    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),\n",
    "    metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')],\n",
    ")\n",
    "\n",
    "history = model.fit(train_data, epochs=EPOCHS, validation_data=validation_data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Predict on new sentences"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "execution_count": null,
   "outputs": [],
   "source": [
    "sentences = [\n",
    "    'This was a good movie. I would watch it again',\n",
    "    'I cannot believe I have wasted time on this movie, it is the worst movie I have ever seen',\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "execution_count": null,
   "outputs": [],
   "source": [
    "# Tokenize the raw sentences before passing them to the fine-tuned model\n",
    "tf_batch = tokenizer(sentences, max_length=MAX_LENGTH, padding=True, truncation=True, return_tensors='tf')\n",
    "tf_outputs = model(tf_batch)\n",
    "\n",
    "# tf_outputs[0] holds the logits; softmax over the last axis turns them into per-class scores\n",
    "tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)\n",
    "\n",
    "labels = ['Negative', 'Positive']  # list position = class id produced by cat2num\n",
    "label = tf.argmax(tf_predictions, axis=1)  # highest-scoring class for each sentence\n",
    "label = label.numpy()\n",
    "for sentence, class_id in zip(sentences, label):\n",
    "    print(sentence, ': ', labels[class_id])"
   ]
  }
 ]
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment