BERT: read the dataset into Pandas and pre-process it.
import pandas as pd
from transformers import BertTokenizer

## Read the training data: tab-separated, no header row
df = pd.read_csv("raw/in_domain_train.tsv", delimiter='\t', header=None, names=['sentence_source', 'label', 'label_notes', 'sentence'])
print(df.sample(5))

## Create the sentence list (the labels are extracted after tokenization below)
sentences = df.sentence.values

## Check the distribution of data based on labels
print("Distribution of data based on labels: ", df.label.value_counts())
# Set the maximum sequence length. The longest sequence in our training set is 47 tokens, but we'll leave room on the end anyway.
# In the original BERT paper, the authors used a maximum length of 512.
MAX_LEN = 128
## Load the BERT tokenizer, which converts our text into tokens from BERT's vocabulary
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
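## Optional sanity check (added here, not part of the original gist): measure the
## tokenized lengths before padding to confirm that MAX_LEN comfortably covers the data
token_lengths = [len(tokenizer.tokenize(sent)) for sent in sentences]
print("Longest tokenized sentence: ", max(token_lengths))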
## Encode each sentence: add the [CLS]/[SEP] special tokens, then pad (and truncate) to MAX_LEN
input_ids = [tokenizer.encode(sent, add_special_tokens=True, max_length=MAX_LEN, padding='max_length', truncation=True) for sent in sentences]
labels = df.label.values
print("Actual sentence before tokenization: ",sentences[2]) | |
print("Encoded Input from dataset: ",input_ids[2]) | |
## Create attention masks: 1 for real tokens, 0 for padding tokens
## (for bert-base-uncased the [PAD] token id is 0, so id > 0 marks a real token)
attention_masks = [[float(i > 0) for i in seq] for seq in input_ids]
print(attention_masks[2])
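## Hedged alternative (not the original gist's approach): newer versions of the
## transformers tokenizer can build the padded ids and attention masks in one batched call
encoded = tokenizer(list(sentences), max_length=MAX_LEN, padding='max_length', truncation=True)
## encoded['input_ids'] and encoded['attention_mask'] are equivalent to the lists built above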