How to load text into a neural network using TorchText
from torchtext import data
#from torchtext.data import BucketIterator
from torchtext import datasets
def simple_tokenizer(text):
    """Simple whitespace tokenizer."""
    return text.split()
# set up fields
TEXT = data.Field(lower=True, include_lengths=True, tokenize=simple_tokenizer)
LABEL = data.LabelField()
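# Quick sanity check (a sketch, not part of the original gist): Field.preprocess
# runs the tokenizer and lowercasing, so you can inspect what a raw string becomes.
print(TEXT.preprocess('This Movie was GREAT'))  # -> ['this', 'movie', 'was', 'great']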
# downloads the dataset automatically :) and returns the train/test splits
# (IMDB's test split is used here as the validation set)
train_ds, valid_ds = datasets.IMDB.splits(TEXT, LABEL)
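# If you want a validation set that is separate from IMDB's test split, a
# torchtext Dataset can be split further; the 0.8 ratio below is an assumption:
# train_ds, valid_ds = train_ds.split(split_ratio=0.8)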
# show a sample from the train dataset
example = train_ds[0]
print(example.label)
print(example.text)
print(f'train={len(train_ds)} valid={len(valid_ds)}')
# build the vocabulary
TEXT.build_vocab(train_ds, min_freq=10, max_size=10000)  # optionally pass vectors=GloVe(name='6B', dim=300); requires from torchtext.vocab import GloVe
LABEL.build_vocab(train_ds)
print(TEXT.vocab.freqs.most_common(20))
vocab = TEXT.vocab
vocab_size = len(vocab)
print(f'vocab_size={vocab_size}')
print(list(vocab.stoi.keys())[0:20])
print(vocab.itos[0:20])
print(vocab.vectors)  # None unless vectors= was passed to build_vocab
print(LABEL.vocab.stoi)
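# As a sketch: numericalize a raw sentence by hand with the built vocab.
# vocab.stoi is a defaultdict, so out-of-vocabulary tokens map to the <unk> index.
tokens = simple_tokenizer('this movie was great')
print([vocab.stoi[t] for t in tokens])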
batch_size = 4
train_iter = data.BucketIterator(
    train_ds, batch_size=batch_size, sort_key=lambda x: len(x.text), sort_within_batch=True)
valid_iter = data.BucketIterator(
    valid_ds, batch_size=batch_size, sort_key=lambda x: len(x.text), sort_within_batch=True)
epochs = 1
# epoch loop
for e in range(epochs):
    for batch_idx, batch in enumerate(train_iter):
        # get the text tensor and label
        batch_text = batch.text[0]  # lengths are at batch.text[1] (include_lengths=True)
        batch_label = batch.label
        print(batch_text)
        print(batch_label)
        # do whatever you want
        # .
        # .
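# A minimal sketch of what "do whatever you want" might look like: embed the
# padded batch and pack it for an LSTM. The embedding dim (100) and hidden
# size (64) are assumptions, not part of the original gist.
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence

embedding = nn.Embedding(vocab_size, 100, padding_idx=vocab.stoi['<pad>'])
lstm = nn.LSTM(100, 64)

batch = next(iter(train_iter))
text, lengths = batch.text                     # (seq_len, batch) tensor plus lengths
embedded = embedding(text)                     # (seq_len, batch, 100)
packed = pack_padded_sequence(embedded, lengths.cpu())  # lengths are descending thanks to sort_within_batch=True
_, (hidden, _) = lstm(packed)
print(hidden.shape)                            # torch.Size([1, batch_size, 64])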