Gopal Sharma skillcate

@skillcate
skillcate / fakenews_pred.py
Last active October 2, 2022 11:00
fakenews_pred
# Testing on unseen data
unseen_news_text = ["Donald Trump Sends Out Embarrassing New Year’s Eve Message; This is Disturbing",  # Fake
                    "WATCH: George W. Bush Calls Out Trump For Supporting White Supremacy",  # Fake
                    "U.S. lawmakers question businessman at 2016 Trump Tower meeting: sources",  # True
                    "Trump administration issues new rules on U.S. visa waivers"  # True
                    ]

# Tokenize and encode sequences in the unseen set (same arguments as in fakenews_preprocessing)
MAX_LENGTH = 15
tokens_unseen = tokenizer.batch_encode_plus(
    unseen_news_text,
    max_length=MAX_LENGTH,
    pad_to_max_length=True,
    truncation=True
)
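The preview stops at the encoding step. A minimal sketch of how the prediction on the unseen titles could proceed, assuming the trained model and the Fake = 1 label encoding from the other gists; the tensor names unseen_seq and unseen_mask are illustrative, not necessarily the author's:

import torch
import numpy as np

# Convert encoded ids and attention masks to tensors (names are illustrative)
unseen_seq = torch.tensor(tokens_unseen['input_ids'])
unseen_mask = torch.tensor(tokens_unseen['attention_mask'])

# Run the trained classifier without tracking gradients
with torch.no_grad():
    preds = model(unseen_seq, unseen_mask)
    preds = preds.detach().cpu().numpy()

# Pick the higher-scoring class per title: 0 = True, 1 = Fake (matching the label encoding gist)
print(np.argmax(preds, axis=1))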
@skillcate
skillcate / fakenews_perf.py
Created October 2, 2022 10:58
fakenews_perf
# Load weights of the best model
path = 'c1_fakenews_weights.pt'
model.load_state_dict(torch.load(path))

# Get predictions for the test set
with torch.no_grad():
    preds = model(test_seq, test_mask)
    preds = preds.detach().cpu().numpy()
preds = np.argmax(preds, axis=1)

print(classification_report(test_y, preds))
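This snippet relies on names defined earlier in the notebook. A hedged sketch of the context it assumes; tokens_test is an assumed name for the test-set encoding produced the same way as tokens_train in fakenews_preprocessing:

import numpy as np
import torch
from sklearn.metrics import classification_report

# Assumed names from the preprocessing step (the exact variables in the full notebook may differ)
test_seq = torch.tensor(tokens_test['input_ids'])          # encoded test titles
test_mask = torch.tensor(tokens_test['attention_mask'])    # corresponding attention masks
test_y = torch.tensor(test_labels.tolist())                # 0/1 labels, Fake = 1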
@skillcate
skillcate / fakenews_training.py
Created October 2, 2022 10:56
fakenews_training
# Train and predict
best_valid_loss = float('inf')
train_losses = []   # empty lists to store training and validation loss of each epoch
valid_losses = []

for epoch in range(epochs):
    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))
    train_loss = train()       # train model
    valid_loss = evaluate()    # evaluate model
    if valid_loss < best_valid_loss:    # save the best model
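The preview ends inside the epoch loop. A minimal sketch of how the checkpointing and bookkeeping might continue; the opening if line repeats the preview for context, the weights filename comes from the fakenews_perf gist, and the rest is an assumption:

    if valid_loss < best_valid_loss:    # save the best model
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'c1_fakenews_weights.pt')   # checkpoint the best weights

    train_losses.append(train_loss)     # record per-epoch losses for later inspection
    valid_losses.append(valid_loss)
    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')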
@skillcate
skillcate / fakenews_train_evaluate_fn.py
Created October 2, 2022 10:55
fakenews_train_evaluate_fn
# Defining training and evaluation functions
def train():
    model.train()
    total_loss, total_accuracy = 0, 0
    for step, batch in enumerate(train_dataloader):    # iterate over batches
        if step % 50 == 0 and not step == 0:           # progress update after every 50 batches
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))
        batch = [r.to(device) for r in batch]          # push the batch to the GPU/CPU device (assumes `device` is defined earlier)
        sent_id, mask, labels = batch
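The preview cuts off after the batch is unpacked. A minimal sketch of how a typical training step could continue under these assumptions: loss_fn and optimizer are assumed to be defined elsewhere in the notebook, and the gradient-clipping threshold is illustrative.

        model.zero_grad()                                # clear previously calculated gradients
        preds = model(sent_id, mask)                     # forward pass
        loss = loss_fn(preds, labels)                    # loss_fn is assumed, e.g. nn.NLLLoss() to pair with the LogSoftmax output
        total_loss = total_loss + loss.item()
        loss.backward()                                  # backward pass
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)   # clip gradients (threshold is illustrative)
        optimizer.step()                                 # update weights (optimizer assumed defined earlier)
    return total_loss / len(train_dataloader)            # average training loss for the epoch

The evaluate() function would presumably mirror this loop with model.eval(), torch.no_grad(), and the validation dataloader.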
@skillcate
skillcate / fakenews_model_architecture.py
Created October 2, 2022 10:52
fakenews_model_architecture
class BERT_Arch(nn.Module):
    def __init__(self, bert):
        super(BERT_Arch, self).__init__()
        self.bert = bert
        self.dropout = nn.Dropout(0.1)          # dropout layer
        self.relu = nn.ReLU()                   # relu activation function
        self.fc1 = nn.Linear(768, 512)          # dense layer 1
        self.fc2 = nn.Linear(512, 2)            # dense layer 2 (output layer)
        self.softmax = nn.LogSoftmax(dim=1)     # log-softmax activation function
    def forward(self, sent_id, mask):           # define the forward pass
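The preview stops at the forward signature. A minimal sketch of a forward pass consistent with the layers declared in __init__, assuming the pooled [CLS] output of bert-base-uncased feeds the classification head; the exact wiring and call signature in the full gist may differ with the transformers version:

    def forward(self, sent_id, mask):
        # Pass inputs through BERT; take the pooled [CLS] representation (768-dim)
        _, cls_hs = self.bert(sent_id, attention_mask=mask, return_dict=False)
        x = self.fc1(cls_hs)        # 768 -> 512
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)             # 512 -> 2 classes
        x = self.softmax(x)         # log-probabilities over {True, Fake}
        return x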
@skillcate
skillcate / fakenews_freeze_weights.py
Created October 2, 2022 10:50
fakenews_freeze_weights
# Freezing the parameters and defining trainable BERT structure
for param in bert.parameters():
    param.requires_grad = False    # gradients are not computed for these parameters, so BERT's weights stay frozen
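A quick, optional check (not part of the original gist) that the freeze worked: after wrapping the frozen BERT in the BERT_Arch class from fakenews_model_architecture, only the classification-head parameters should remain trainable.

model = BERT_Arch(bert)    # wrap the frozen BERT with the classification head (construction assumed to match the full notebook)
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f'Trainable parameters: {trainable:,} of {total:,}')   # only the fc1/fc2 head should be trainable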
@skillcate
skillcate / fakenews_preprocessing.py
Created October 2, 2022 10:48
fakenews_preprocessing
# Most titles above are under 15 words, so we set the maximum title length to 15
MAX_LENGTH = 15

# Tokenize and encode sequences in the train set
tokens_train = tokenizer.batch_encode_plus(
    train_text.tolist(),
    max_length=MAX_LENGTH,
    pad_to_max_length=True,
    truncation=True
)

# Tokenize and encode sequences in the validation set
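The preview ends at that comment. A minimal sketch of how the validation encoding and the train-set DataLoader might follow; train_dataloader is the name the training gist iterates over, while tokens_val, train_seq, train_mask, train_y and the batch size are assumptions:

tokens_val = tokenizer.batch_encode_plus(
    val_text.tolist(),
    max_length=MAX_LENGTH,
    pad_to_max_length=True,
    truncation=True
)

# Convert the encoded train set to tensors and build a DataLoader (batch size is illustrative)
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

train_seq = torch.tensor(tokens_train['input_ids'])
train_mask = torch.tensor(tokens_train['attention_mask'])
train_y = torch.tensor(train_labels.tolist())

train_data = TensorDataset(train_seq, train_mask, train_y)
train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=32)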
@skillcate
skillcate / fakenews_load_bert.py
Last active October 2, 2022 10:47
fakenews_load_bert
# Load BERT model and tokenizer via HuggingFace Transformers
bert = AutoModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
# Plot histogram of the number of words in train data 'title'
seq_len = [len(title.split()) for title in train_text]
pd.Series(seq_len).hist(bins=40, color='firebrick')
plt.xlabel('Number of Words')
plt.ylabel('Number of texts')
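These gist previews omit their import cells. A hedged sketch of the imports the snippets on this page appear to rely on; the full notebook's import list may differ:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import AutoModel, BertTokenizerFast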
@skillcate
skillcate / fakenews_traintestsplit.py
Created October 2, 2022 10:35
fakenews_traintestsplit
# Train-Validation-Test set split in a 70:15:15 ratio
# Train-Temp split
train_text, temp_text, train_labels, temp_labels = train_test_split(data['title'], data['label'],
                                                                    random_state=2018,
                                                                    test_size=0.3,
                                                                    stratify=data['Target'])
# Validation-Test split
val_text, test_text, val_labels, test_labels = train_test_split(temp_text, temp_labels,
                                                                random_state=2018,
                                                                test_size=0.5,
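The second split is cut off. A minimal sketch of how it presumably completes; the first three arguments repeat the preview, and stratify=temp_labels is an assumption by analogy with the first split, so the validation and test sets keep the class proportions of the temporary split:

val_text, test_text, val_labels, test_labels = train_test_split(temp_text, temp_labels,
                                                                random_state=2018,
                                                                test_size=0.5,
                                                                stratify=temp_labels)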
@skillcate
skillcate / fakenews_load_dataset.py
Created October 2, 2022 10:32
fakenews_load_dataset
# Load Dataset
true_data = pd.read_csv('a1_True.csv')
fake_data = pd.read_csv('a2_Fake.csv')

# Generate labels True/Fake under a new Target column in 'true_data' and 'fake_data'
true_data['Target'] = ['True'] * len(true_data)
fake_data['Target'] = ['Fake'] * len(fake_data)

# Merge 'true_data' and 'fake_data' and shuffle the rows into a single df called 'data'
data = pd.concat([true_data, fake_data]).sample(frac=1).reset_index(drop=True)

# The Target column holds the strings True/Fake; encode it as numbers 0/1 (Fake = 1)
data['label'] = pd.get_dummies(data.Target)['Fake']
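A quick, optional sanity check (not part of the original gist) that the merge and label encoding behaved as expected:

print(data.shape)                                   # total rows = len(true_data) + len(fake_data)
print(data['Target'].value_counts())                # class balance of True vs Fake
print(data[['title', 'Target', 'label']].head())    # label should be 1 wherever Target == 'Fake'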