Shivampanwar

## finetuning.py


import pandas as pd
from datasets import load_dataset


# Identifier of the dataset passed to `load_dataset` below.
huggingface_dataset_name = "knkarthick/dialogsum"

# Load the dataset named above and keep it in `dataset` for later cells.
dataset = load_dataset(huggingface_dataset_name)

## generate_template.py
def build_llama2_prompt(messages):
    startPrompt = "<s>[INST] "
    endPrompt = " [/INST]"
    conversation = []
    for index, message in enumerate(messages):
        if message["role"] == "system" and index == 0:
            conversation.append(f"<<SYS>>\n{message['content']}\n<</SYS>>\n\n")
        elif message["role"] == "user":
            conversation.append(message["content"].strip())
        else:

## Install CUDA pytorch tensorflow.md

      
              1 file
            
          
              1 fork
            
          
              1 comment
            
          
              3 stars
            
          
                Shivampanwar
                / Install CUDA pytorch tensorflow.md
            
            
              Last active
              November 12, 2020 20:10
            
              
                Installing any version of CUDA on Ubuntu and installing GPU versions of both torch and tensorflow
              
          
    This guide contains steps for installing CUDA. It further shows how to install TensorFlow and PyTorch and
use GPUs with them. Our goal is to get output like that in the image below,

where nvidia-smi, nvcc --version,
torch.cuda.is_available(), and tf.test.gpu_device_name() all work. Let's go then.
I will specifically install CUDA 10.1, with which both TensorFlow and PyTorch work fine.

  
## tokenization.py
# Lowercase the review text in place, then pull out the label column.
train_df["review"] = train_df["review"].str.lower()
labels = train_df["sentiment"].values

# BERT expects each input to start with [CLS] and end with [SEP], so wrap
# every review in those special tokens before tokenizing.
sentences = ["[CLS] " + review + " [SEP]" for review in train_df["review"].values]

# Split each wrapped review into tokens with the pretrained uncased tokenizer.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokenized_texts = list(map(tokenizer.tokenize, sentences))
print("Tokenize the first sentence:")

## Finetune
##combine train and test reviews
lm_df = pd.concat([train_df[['review']],test_df[['review']]])
lm_df.review = lm_df.review.str.lower()
tqdm.pandas()
## We need one sentence per line with space between two lines
changed_text=lm_df.review.apply(lambda x:x+"\n"+"\n")
open(os.path.join(directory_path,'data_lm.txt'), "w").write(''.join(changed_text))
##We need to create data in Bert Format, below command will do that
!python3 pregenerate_training_data.py --train_corpus data_lm.txt --bert_model bert-base-uncased --do_lower_case --output_dir training/ --epochs_to_generate 2 --max_seq_len 256
##We would now use this data to finetune the model


	import pandas as pd
	from datasets import load_dataset



	huggingface_dataset_name = "knkarthick/dialogsum"

	dataset = load_dataset(huggingface_dataset_name)
	def build_llama2_prompt(messages):
	startPrompt = "<s>[INST] "
	endPrompt = " [/INST]"
	conversation = []
	for index, message in enumerate(messages):
	if message["role"] == "system" and index == 0:
	conversation.append(f"<<SYS>>\n{message['content']}\n<</SYS>>\n\n")
	elif message["role"] == "user":
	conversation.append(message["content"].strip())
	else:
	train_df.review = train_df.review.str.lower()
	sentences = train_df.review.values

	# We need to add special tokens at the beginning and end of each sentence for BERT to work properly
	sentences = ["[CLS] " + sentence + " [SEP]" for sentence in sentences]
	labels = train_df.sentiment.values

	tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
	tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
	print ("Tokenize the first sentence:")
	##combine train and test reviews
	lm_df = pd.concat([train_df[['review']],test_df[['review']]])
	lm_df.review = lm_df.review.str.lower()
	tqdm.pandas()
	## We need one sentence per line with space between two lines
	changed_text=lm_df.review.apply(lambda x:x+"\n"+"\n")
	open(os.path.join(directory_path,'data_lm.txt'), "w").write(''.join(changed_text))
	##We need to create data in Bert Format, below command will do that
	!python3 pregenerate_training_data.py --train_corpus data_lm.txt --bert_model bert-base-uncased --do_lower_case --output_dir training/ --epochs_to_generate 2 --max_seq_len 256
	##We would now use this data to finetune the model