Created
September 2, 2020 15:58
-
-
Save priya-dwivedi/1c842ea402a8bd9a1b79e785fe65a9c0 to your computer and use it in GitHub Desktop.
T5 Load WikiHow Data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Load Data from NLP Library
from nlp import load_dataset

# Load the 'all' configuration of the WikiHow dataset.
# NOTE(review): data_dir='data/' presumably points at locally downloaded
# WikiHow files -- confirm the expected directory contents.
dataset = load_dataset('wikihow', 'all', data_dir='data/')

# Show which splits were loaded and how large the train/validation splits are.
print(dataset.keys())
print("Size of train dataset: ", dataset['train'].shape)
print("Size of Validation dataset: ", dataset['validation'].shape)
## Look at Sample Examples
# Fetch the first training row once and reuse it, rather than indexing the
# dataset again for every print.
first_example = dataset['train'][0]
print(first_example.keys())
print(" Example of text: ", first_example['text'])
print(" Example of Summary: ", first_example['headline'])
## Estimate Average Length of Text and Summary
# Work on the first 100 training examples only, to keep this quick.
tiny_dataset = dataset['train'].select(list(range(100)))

text_len = []      # word count of each example's 'text' field
summary_len = []   # word count of each example's 'headline' field
for i in range(len(tiny_dataset)):
    example = tiny_dataset[i]
    # str.split() with no arguments splits on any whitespace run, newlines
    # included. Deleting '\n' first (as the earlier version did) would glue
    # the words on either side of a line break together and undercount.
    text_len.append(len(example['text'].split()))
    summary_len.append(len(example['headline'].split()))
## Plot a histogram of the collected text lengths
import matplotlib.pyplot as plt

# `%matplotlib inline` is IPython-only syntax and a SyntaxError in a plain
# Python script. Calling the magic through the IPython API keeps inline
# plotting in notebooks while leaving this code valid everywhere.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass  # not running under IPython/Jupyter; use the default backend

plt.hist(text_len)
plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment