# Testing on unseen data
unseen_news_text = ["Donald Trump Sends Out Embarrassing New Year’s Eve Message; This is Disturbing",  # Fake
                    "WATCH: George W. Bush Calls Out Trump For Supporting White Supremacy",            # Fake
                    "U.S. lawmakers question businessman at 2016 Trump Tower meeting: sources",        # True
                    "Trump administration issues new rules on U.S. visa waivers"                       # True
                    ]

# Tokenize and encode sequences in the unseen set
MAX_LENGTH = 15
tokens_unseen = tokenizer.batch_encode_plus(
    unseen_news_text,
    max_length = MAX_LENGTH,
    pad_to_max_length=True,
    truncation=True
)
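With the unseen headlines encoded, a minimal sketch of scoring them might look like the following (`model` and `device` are assumed to be set up as in the training snippets; the tensor names `unseen_seq`/`unseen_mask` are illustrative, not from the original gist):

unseen_seq = torch.tensor(tokens_unseen['input_ids'])        # token ids as a tensor
unseen_mask = torch.tensor(tokens_unseen['attention_mask'])  # attention masks as a tensor

model.eval()                                                 # disable dropout for inference
with torch.no_grad():
    preds = model(unseen_seq.to(device), unseen_mask.to(device))
preds = np.argmax(preds.detach().cpu().numpy(), axis=1)
print(preds)   # 1 = Fake, 0 = True, per the label encoding in the dataset snippet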
# Load weights of best model
path = 'c1_fakenews_weights.pt'
model.load_state_dict(torch.load(path))

with torch.no_grad():
    preds = model(test_seq, test_mask)
    preds = preds.detach().cpu().numpy()
preds = np.argmax(preds, axis = 1)
print(classification_report(test_y, preds))
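A confusion matrix alongside the classification report makes the error pattern explicit; a one-line sketch with scikit-learn, assuming `test_y` and `preds` as above:

from sklearn.metrics import confusion_matrix
print(confusion_matrix(test_y, preds))   # rows = true class, columns = predicted class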
# Train and predict
best_valid_loss = float('inf')
train_losses = []   # empty lists to store training and validation loss of each epoch
valid_losses = []
for epoch in range(epochs):
    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))
    train_loss = train()       # train model
    valid_loss = evaluate()    # evaluate model
    if valid_loss < best_valid_loss:                               # save the best model
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'c1_fakenews_weights.pt')   # same weights file loaded for testing
    train_losses.append(train_loss)    # store losses for each epoch
    valid_losses.append(valid_loss)
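Since the per-epoch losses are collected, a quick plot helps judge over- or under-fitting; a minimal sketch using matplotlib as in the histogram snippet:

plt.plot(train_losses, label='train loss')
plt.plot(valid_losses, label='validation loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()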
# Defining training and evaluation functions
def train():
    model.train()
    total_loss, total_accuracy = 0, 0
    for step, batch in enumerate(train_dataloader):    # iterate over batches
        if step % 50 == 0 and not step == 0:           # progress update after every 50 batches
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))
        batch = [r.to(device) for r in batch]          # push the batch to the GPU
        sent_id, mask, labels = batch
        model.zero_grad()                              # clear previously calculated gradients
        preds = model(sent_id, mask)                   # forward pass
        loss = cross_entropy(preds, labels)            # cross_entropy is defined with the model setup
        total_loss += loss.item()
        loss.backward()                                # backward pass
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)   # clip gradients to avoid exploding gradients
        optimizer.step()                               # update parameters
    return total_loss / len(train_dataloader)          # average training loss for the epoch
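The `evaluate()` counterpart called in the training loop is not shown in these snippets; a minimal sketch under the same assumptions (same `cross_entropy` and `device`, and a `val_dataloader` built like the training one):

def evaluate():
    model.eval()                    # disable dropout during validation
    total_loss = 0
    for batch in val_dataloader:
        sent_id, mask, labels = [r.to(device) for r in batch]
        with torch.no_grad():       # no gradients needed for evaluation
            preds = model(sent_id, mask)
            total_loss += cross_entropy(preds, labels).item()
    return total_loss / len(val_dataloader)   # average validation loss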
class BERT_Arch(nn.Module):
    def __init__(self, bert):
        super(BERT_Arch, self).__init__()
        self.bert = bert
        self.dropout = nn.Dropout(0.1)        # dropout layer
        self.relu = nn.ReLU()                 # ReLU activation function
        self.fc1 = nn.Linear(768, 512)        # dense layer 1
        self.fc2 = nn.Linear(512, 2)          # dense layer 2 (output layer)
        self.softmax = nn.LogSoftmax(dim=1)   # log-softmax activation function
    def forward(self, sent_id, mask):         # define the forward pass
        _, cls_hs = self.bert(sent_id, attention_mask=mask, return_dict=False)   # pooled [CLS] representation
        x = self.fc1(cls_hs)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return self.softmax(x)                # log-probabilities over the two classes
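To connect the class to the training loop, a minimal setup sketch (the `AdamW` learning rate is an assumed value; `NLLLoss` is the natural pairing with the `LogSoftmax` output, together amounting to cross-entropy):

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BERT_Arch(bert).to(device)    # wrap the frozen BERT in the classification head
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)   # assumed learning rate
cross_entropy = nn.NLLLoss()          # NLLLoss on log-probabilities = cross-entropy loss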
# Freezing the parameters and defining trainable BERT structure
for param in bert.parameters():
    param.requires_grad = False    # False here means the gradient need not be computed
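A quick sanity check (not in the original post) that only the classification head will be updated, assuming the `model` wrapper from the snippet above:

trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f'{trainable:,} trainable of {total:,} total parameters')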
# The majority of titles above have a word length under 15, so we set the max title length to 15
MAX_LENGTH = 15

# Tokenize and encode sequences in the train set
tokens_train = tokenizer.batch_encode_plus(
    train_text.tolist(),
    max_length = MAX_LENGTH,
    pad_to_max_length=True,
    truncation=True
)

# Tokenize and encode sequences in the validation set
tokens_val = tokenizer.batch_encode_plus(
    val_text.tolist(),
    max_length = MAX_LENGTH,
    pad_to_max_length=True,
    truncation=True
)
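The encodings still have to become tensors and batches before training; a minimal sketch of that step (the test set is encoded the same way to produce the `test_seq`/`test_mask`/`test_y` used above, and `batch_size` is an assumed value):

from torch.utils.data import TensorDataset, DataLoader, RandomSampler

train_seq = torch.tensor(tokens_train['input_ids'])        # token ids
train_mask = torch.tensor(tokens_train['attention_mask'])  # attention masks
train_y = torch.tensor(train_labels.tolist())              # 0/1 labels

batch_size = 32   # assumed value
train_data = TensorDataset(train_seq, train_mask, train_y)
train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=batch_size)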
# Load BERT model and tokenizer via HuggingFace Transformers
bert = AutoModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Plot histogram of the number of words in the train data 'title'
seq_len = [len(title.split()) for title in train_text]
pd.Series(seq_len).hist(bins=40, color='firebrick')
plt.xlabel('Number of Words')
plt.ylabel('Number of texts')
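The "majority of titles under 15 words" claim in the tokenization snippet can also be checked numerically before committing to a max length:

print(pd.Series(seq_len).describe())   # mean, quartiles, and max of title word counts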
# Train-Validation-Test set split in a 70:15:15 ratio
# Train-Temp split
train_text, temp_text, train_labels, temp_labels = train_test_split(data['title'], data['label'],
                                                                    random_state=2018,
                                                                    test_size=0.3,
                                                                    stratify=data['Target'])
# Validation-Test split
val_text, test_text, val_labels, test_labels = train_test_split(temp_text, temp_labels,
                                                                random_state=2018,
                                                                test_size=0.5,
                                                                stratify=temp_labels)
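A quick check that the split sizes match the intended 70:15:15 ratio:

print(len(train_text), len(val_text), len(test_text))   # ~70% / 15% / 15% of len(data)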
# Load Dataset
true_data = pd.read_csv('a1_True.csv')
fake_data = pd.read_csv('a2_Fake.csv')

# Generate labels True/Fake under a new Target column in 'true_data' and 'fake_data'
true_data['Target'] = ['True'] * len(true_data)
fake_data['Target'] = ['Fake'] * len(fake_data)

# Merge 'true_data' and 'fake_data' into a single shuffled df called 'data'
# (pd.concat replaces the deprecated DataFrame.append; reset_index(drop=True) discards the old index in one step)
data = pd.concat([true_data, fake_data]).sample(frac=1).reset_index(drop=True)

# The Target column holds string values True/Fake; change it to numbers 0/1 (Fake=1)
data['label'] = pd.get_dummies(data.Target)['Fake']
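Before splitting, it is worth confirming the two classes are reasonably balanced (a one-line check, not part of the original gist):

print(data['Target'].value_counts())   # counts of True vs Fake headlines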