# Testing on unseen data
unseen_news_text = ["Donald Trump Sends Out Embarrassing New Year’s Eve Message; This is Disturbing",  # Fake
                    "WATCH: George W. Bush Calls Out Trump For Supporting White Supremacy",            # Fake
                    "U.S. lawmakers question businessman at 2016 Trump Tower meeting: sources",        # True
                    "Trump administration issues new rules on U.S. visa waivers"                       # True
                    ]

# Tokenize and encode sequences in the unseen set
MAX_LENGTH = 15
tokens_unseen = tokenizer.batch_encode_plus(
    unseen_news_text,
    max_length = MAX_LENGTH,
    pad_to_max_length=True,
    truncation=True
)
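With the unseen headlines encoded, a minimal sketch of scoring them might look like the following (`model` and `device` are assumed to be set up as in the training snippets; the tensor names `unseen_seq`/`unseen_mask` are illustrative, not from the original gist):

unseen_seq = torch.tensor(tokens_unseen['input_ids'])        # token ids as a tensor
unseen_mask = torch.tensor(tokens_unseen['attention_mask'])  # attention masks as a tensor

model.eval()                                                 # disable dropout for inference
with torch.no_grad():
    preds = model(unseen_seq.to(device), unseen_mask.to(device))
preds = np.argmax(preds.detach().cpu().numpy(), axis=1)
print(preds)   # 1 = Fake, 0 = True, per the label encoding in the dataset snippet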
# Load weights of best model
path = 'c1_fakenews_weights.pt'
model.load_state_dict(torch.load(path))

with torch.no_grad():
    preds = model(test_seq, test_mask)
    preds = preds.detach().cpu().numpy()
preds = np.argmax(preds, axis = 1)
print(classification_report(test_y, preds))
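A confusion matrix alongside the classification report makes the error pattern explicit; a one-line sketch with scikit-learn, assuming `test_y` and `preds` as above:

from sklearn.metrics import confusion_matrix
print(confusion_matrix(test_y, preds))   # rows = true class, columns = predicted class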
# Train and predict
best_valid_loss = float('inf')
train_losses = []   # empty lists to store training and validation loss of each epoch
valid_losses = []
for epoch in range(epochs):
    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))
    train_loss = train()       # train model
    valid_loss = evaluate()    # evaluate model
    if valid_loss < best_valid_loss:                               # save the best model
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'c1_fakenews_weights.pt')   # same weights file loaded for testing
    train_losses.append(train_loss)    # store losses for each epoch
    valid_losses.append(valid_loss)
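Since the per-epoch losses are collected, a quick plot helps judge over- or under-fitting; a minimal sketch using matplotlib as in the histogram snippet:

plt.plot(train_losses, label='train loss')
plt.plot(valid_losses, label='validation loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()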
# Defining training and evaluation functions
def train():
    model.train()
    total_loss, total_accuracy = 0, 0
    for step, batch in enumerate(train_dataloader):    # iterate over batches
        if step % 50 == 0 and not step == 0:           # progress update after every 50 batches
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))
        batch = [r.to(device) for r in batch]          # push the batch to the GPU
        sent_id, mask, labels = batch
        model.zero_grad()                              # clear previously calculated gradients
        preds = model(sent_id, mask)                   # forward pass
        loss = cross_entropy(preds, labels)            # cross_entropy is defined with the model setup
        total_loss += loss.item()
        loss.backward()                                # backward pass
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)   # clip gradients to avoid exploding gradients
        optimizer.step()                               # update parameters
    return total_loss / len(train_dataloader)          # average training loss for the epoch
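The `evaluate()` counterpart called in the training loop is not shown in these snippets; a minimal sketch under the same assumptions (same `cross_entropy` and `device`, and a `val_dataloader` built like the training one):

def evaluate():
    model.eval()                    # disable dropout during validation
    total_loss = 0
    for batch in val_dataloader:
        sent_id, mask, labels = [r.to(device) for r in batch]
        with torch.no_grad():       # no gradients needed for evaluation
            preds = model(sent_id, mask)
            total_loss += cross_entropy(preds, labels).item()
    return total_loss / len(val_dataloader)   # average validation loss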
class BERT_Arch(nn.Module):
    def __init__(self, bert):
        super(BERT_Arch, self).__init__()
        self.bert = bert
        self.dropout = nn.Dropout(0.1)        # dropout layer
        self.relu = nn.ReLU()                 # ReLU activation function
        self.fc1 = nn.Linear(768, 512)        # dense layer 1
        self.fc2 = nn.Linear(512, 2)          # dense layer 2 (output layer)
        self.softmax = nn.LogSoftmax(dim=1)   # log-softmax activation function
    def forward(self, sent_id, mask):         # define the forward pass
        _, cls_hs = self.bert(sent_id, attention_mask=mask, return_dict=False)   # pooled [CLS] representation
        x = self.fc1(cls_hs)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return self.softmax(x)                # log-probabilities over the two classes
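To connect the class to the training loop, a minimal setup sketch (the `AdamW` learning rate is an assumed value; `NLLLoss` is the natural pairing with the `LogSoftmax` output, together amounting to cross-entropy):

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BERT_Arch(bert).to(device)    # wrap the frozen BERT in the classification head
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)   # assumed learning rate
cross_entropy = nn.NLLLoss()          # NLLLoss on log-probabilities = cross-entropy loss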
# Freezing the parameters and defining trainable BERT structure
for param in bert.parameters():
    param.requires_grad = False    # False here means the gradient need not be computed
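A quick sanity check (not in the original post) that only the classification head will be updated, assuming the `model` wrapper from the snippet above:

trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f'{trainable:,} trainable of {total:,} total parameters')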
# The majority of titles above have a word length under 15, so we set the max title length to 15
MAX_LENGTH = 15

# Tokenize and encode sequences in the train set
tokens_train = tokenizer.batch_encode_plus(
    train_text.tolist(),
    max_length = MAX_LENGTH,
    pad_to_max_length=True,
    truncation=True
)

# Tokenize and encode sequences in the validation set
tokens_val = tokenizer.batch_encode_plus(
    val_text.tolist(),
    max_length = MAX_LENGTH,
    pad_to_max_length=True,
    truncation=True
)
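The encodings still have to become tensors and batches before training; a minimal sketch of that step (the test set is encoded the same way to produce the `test_seq`/`test_mask`/`test_y` used above, and `batch_size` is an assumed value):

from torch.utils.data import TensorDataset, DataLoader, RandomSampler

train_seq = torch.tensor(tokens_train['input_ids'])        # token ids
train_mask = torch.tensor(tokens_train['attention_mask'])  # attention masks
train_y = torch.tensor(train_labels.tolist())              # 0/1 labels

batch_size = 32   # assumed value
train_data = TensorDataset(train_seq, train_mask, train_y)
train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=batch_size)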
# Load BERT model and tokenizer via HuggingFace Transformers
bert = AutoModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Plot histogram of the number of words in the train data 'title'
seq_len = [len(title.split()) for title in train_text]
pd.Series(seq_len).hist(bins=40, color='firebrick')
plt.xlabel('Number of Words')
plt.ylabel('Number of texts')
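The "majority of titles under 15 words" claim in the tokenization snippet can also be checked numerically before committing to a max length:

print(pd.Series(seq_len).describe())   # mean, quartiles, and max of title word counts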
# Train-Validation-Test set split in a 70:15:15 ratio
# Train-Temp split
train_text, temp_text, train_labels, temp_labels = train_test_split(data['title'], data['label'],
                                                                    random_state=2018,
                                                                    test_size=0.3,
                                                                    stratify=data['Target'])
# Validation-Test split
val_text, test_text, val_labels, test_labels = train_test_split(temp_text, temp_labels,
                                                                random_state=2018,
                                                                test_size=0.5,
                                                                stratify=temp_labels)
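A quick check that the split sizes match the intended 70:15:15 ratio:

print(len(train_text), len(val_text), len(test_text))   # ~70% / 15% / 15% of len(data)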
# Load Dataset
true_data = pd.read_csv('a1_True.csv')
fake_data = pd.read_csv('a2_Fake.csv')

# Generate labels True/Fake under a new Target column in 'true_data' and 'fake_data'
true_data['Target'] = ['True'] * len(true_data)
fake_data['Target'] = ['Fake'] * len(fake_data)

# Merge 'true_data' and 'fake_data' into a single shuffled df called 'data'
# (pd.concat replaces the deprecated DataFrame.append; reset_index(drop=True) discards the old index in one step)
data = pd.concat([true_data, fake_data]).sample(frac=1).reset_index(drop=True)

# The Target column holds string values True/Fake; change it to numbers 0/1 (Fake=1)
data['label'] = pd.get_dummies(data.Target)['Fake']
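Before splitting, it is worth confirming the two classes are reasonably balanced (a one-line check, not part of the original gist):

print(data['Target'].value_counts())   # counts of True vs Fake headlines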