## priya-dwivedi/load_wikihow.py

## load_wikihow.py
## Load the WikiHow summarization dataset, print a sample example, and plot
## a histogram of article lengths (in words) over a small training sample.
import matplotlib.pyplot as plt
from nlp import load_dataset

## Number of training examples sampled for the length estimate
SAMPLE_SIZE = 100

## `%matplotlib inline` is IPython-only syntax and a SyntaxError in a .py file.
## Issue the same magic through the IPython API when it is available; a plain
## Python interpreter has no `get_ipython`, so the magic is simply skipped.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

## Load Data from NLP Library
dataset = load_dataset('wikihow', 'all', data_dir='data/')
print(dataset.keys())
print("Size of train dataset: ", dataset['train'].shape)
print("Size of Validation dataset: ", dataset['validation'].shape)

## Look at Sample Examples
print(dataset['train'][0].keys())
print(" Example of text: ", dataset['train'][0]['text'])
print(" Example of Summary: ", dataset['train'][0]['headline'])

## Estimate Average Length of Text and Summary
tiny_dataset = dataset['train'].select(list(range(SAMPLE_SIZE)))
text_len = []
summary_len = []
for example in tiny_dataset:
    ## str.split() with no argument already splits on any whitespace,
    ## newlines included. Deleting '\n' first would glue the last word of one
    ## line to the first word of the next and undercount the words.
    text_len.append(len(example['text'].split()))
    summary_len.append(len(example['headline'].split()))

## Distribution of article lengths (word counts) over the sample
plt.hist(text_len)
plt.show()