Character-level text generation with PyTorch LSTM
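The gist contains two scripts: a training script that reads a plain-text corpus from input.txt and periodically saves checkpoints (model_*.pth plus a *_state.pth file holding the hidden state and character set), and a generation script that reloads a checkpoint and samples new text. Both scripts represent characters as one-hot vectors over the corpus vocabulary. A minimal sketch of that encoding, using a stand-in string in place of input.txt (the names here are illustrative, not part of the gist):

import torch as th

corpus = "hello world"  # stand-in for the contents of input.txt
chars = list(set(corpus))
char_to_ix = {ch: i for i, ch in enumerate(chars)}
ix_to_char = {i: ch for i, ch in enumerate(chars)}

# One-hot encode the string: one row per character, shape (seq_len, vocab_size)
x = th.zeros(len(corpus), len(chars))
for i, ch in enumerate(corpus):
    x[i, char_to_ix[ch]] = 1

# Recover the original string from the one-hot rows
decoded = "".join(ix_to_char[int(row.argmax())] for row in x)
assert decoded == corpus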
# Based on min-char-rnn.py
# https://gist.github.com/karpathy/d4dee566867f8291f086
import torch as th
import torch.nn as nn
import numpy as np


class Model(nn.Module):
    def __init__(self, hidden_size, vocab_size, num_layers):
        super(Model, self).__init__()
        self.vocab_size = vocab_size
        self.lstm = nn.LSTM(vocab_size, hidden_size, num_layers)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, h, train=True):
        y, h = self.lstm(x, h)
        y = self.fc(y)
        if not train:
            y = nn.functional.softmax(y, dim=1)
        return y, h


def sample(model_state, seed_ix, n, device='cpu'):
    """
    Sample a sequence of integers from the LSTM model.
    seed_ix is a seed letter for the first time step.
    """
    model, h = model_state
    x = th.zeros(model.vocab_size).to(device)
    x[seed_ix] = 1
    ixes = [seed_ix]
    with th.no_grad():
        for t in range(n):
            x = x.view(1, -1)
            p, h = model(x, h, train=False)
            p = p.ravel()
            ix = np.random.choice(range(model.vocab_size), p=p.cpu().numpy())
            x = th.zeros(model.vocab_size).to(device)
            x[ix] = 1
            ixes.append(ix)
    return ixes


def main(device='cuda:0'):
    # Data I/O
    data = open("input.txt", "r").read()  # Must be a simple plain text file
    chars = list(set(data))
    data_size, vocab_size = len(data), len(chars)
    print("data has %d characters, %d unique." % (data_size, vocab_size))
    char_to_ix = {ch: i for i, ch in enumerate(chars)}
    ix_to_char = {i: ch for i, ch in enumerate(chars)}

    # Hyperparameters
    hidden_size = 512  # size of hidden layer of neurons
    seq_length = 25  # number of steps to unroll the RNN for
    learning_rate = 1e-2
    num_layers = 3

    model = Model(hidden_size, vocab_size, num_layers).to(device)

    # Initialize optimizer
    optimizer = th.optim.Adam(model.parameters(), lr=learning_rate)

    # Train
    n, p = 0, 0
    # Initialize the hidden state
    h0 = (th.zeros(num_layers, hidden_size).to(device),
          th.zeros(num_layers, hidden_size).to(device))
    for _ in range(10_000_000 + 1):
        # Prepare inputs (we're sweeping from left to right in steps seq_length long)
        if p + seq_length + 1 >= len(data) or n == 0:
            # Reset LSTM memory
            h0 = (th.zeros(num_layers, hidden_size).to(device),
                  th.zeros(num_layers, hidden_size).to(device))
            # Go from start of data
            p = 0
        inputs = [char_to_ix[ch] for ch in data[p : p + seq_length]]
        targets = [char_to_ix[ch] for ch in data[p + 1 : p + seq_length + 1]]

        # Sample from the model now and then
        if n % 1000 == 0:
            sample_ix = sample((model, h0), inputs[0], 500, device=device)
            txt = "".join(ix_to_char[ix] for ix in sample_ix)
            print("\n%s\n----" % (txt,))

        # From int to one-hot
        tgt = th.zeros(len(targets), vocab_size).to(device)
        for i in range(len(targets)):
            tgt[i, targets[i]] = 1
        x = th.zeros(len(inputs), vocab_size).to(device)
        for i in range(len(inputs)):
            x[i, inputs[i]] = 1

        # Optimize LSTM with cross-entropy loss
        optimizer.zero_grad()
        y, h0 = model(x, h0)
        loss = nn.functional.cross_entropy(y, tgt)
        # Avoid RuntimeError: Trying to backward through the graph a second time
        # Detach the hidden state from the graph
        h0 = (h0[0].detach(), h0[1].detach())
        if n % 10000 == 0:
            progress = 100 * n * seq_length / data_size
            print(f"{progress:.2f}% Loss: {loss.item():.4f}")
            print("----")
        loss.backward()
        # Clip gradients to avoid exploding gradients
        th.nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()

        if n % 250_000 == 0:
            th.save(model.state_dict(), f"model_{n//1000}.pth")
            # Save hidden state and chars
            th.save((h0, chars), f"model_{n//1000}_state.pth")

        p += seq_length  # move data pointer
        n += 1  # iteration counter


if __name__ == "__main__":
    main()
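The second script in the gist (below) reloads a saved checkpoint and generates text with the same sample() helper. One detail worth noting: forward() returns raw logits during training, which is what nn.functional.cross_entropy expects, and only applies a softmax when called with train=False, so that np.random.choice receives a proper probability distribution. A minimal sketch of that behaviour, assuming the Model class from the script above is in scope (the sizes here are arbitrary, not the gist's hyperparameters):

import torch as th

vocab_size, hidden_size, num_layers = 5, 8, 1
m = Model(hidden_size, vocab_size, num_layers)
h = (th.zeros(num_layers, hidden_size), th.zeros(num_layers, hidden_size))

x = th.zeros(1, vocab_size)
x[0, 0] = 1  # one-hot input for a single time step
logits, _ = m(x, h)              # unnormalized scores, used with cross_entropy during training
probs, _ = m(x, h, train=False)  # softmax over the vocabulary axis
print(probs.sum().item())        # ~1.0, suitable for np.random.choice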
import torch as th
import torch.nn as nn
import numpy as np


class Model(nn.Module):
    def __init__(self, hidden_size, vocab_size, num_layers):
        super(Model, self).__init__()
        self.vocab_size = vocab_size
        self.lstm = nn.LSTM(vocab_size, hidden_size, num_layers)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, h, train=True):
        y, h = self.lstm(x, h)
        y = self.fc(y)
        if not train:
            y = nn.functional.softmax(y, dim=1)
        return y, h


def sample(model_state, seed_ix, n, device="cpu"):
    """
    Sample a sequence of integers from the LSTM model.
    seed_ix is a seed letter for the first time step.
    """
    model, h = model_state
    x = th.zeros(model.vocab_size).to(device)
    x[seed_ix] = 1
    ixes = [seed_ix]
    with th.no_grad():
        for t in range(n):
            x = x.view(1, -1)
            p, h = model(x, h, train=False)
            p = p.ravel()
            ix = np.random.choice(range(model.vocab_size), p=p.cpu().numpy())
            x = th.zeros(model.vocab_size).to(device)
            x[ix] = 1
            ixes.append(ix)
    return ixes
def test_model(device="cpu", model_filename="model.pth", state_filename="model_state.pth"):
    # Restore the saved hidden state and character set, moving the tensors to the target device
    h0, chars = th.load(state_filename, map_location=device)
    h1, h2 = h0
    h1 = h1.to(device)
    h2 = h2.to(device)
    h0 = h1, h2
    vocab_size = len(chars)
    char_to_ix = {ch: i for i, ch in enumerate(chars)}
    ix_to_char = {i: ch for i, ch in enumerate(chars)}

    # Hyperparameters: these must match the configuration the checkpoint was trained with
    hidden_size = 128  # size of hidden layer of neurons
    num_layers = 2

    model = Model(hidden_size, vocab_size, num_layers).to(device)
    # Load the trained weights onto the chosen device
    model.load_state_dict(th.load(model_filename, map_location=device))
    model.eval()

    # Seed generation with "#" (the seed character must occur in the training corpus)
    sample_ix = sample((model, h0), char_to_ix["#"], 2000)
    txt = "".join(ix_to_char[ix] for ix in sample_ix)
    print(txt)


if __name__ == "__main__":
    f = "model_1500"
    test_model("cpu", f"{f}.pth", f"{f}_state.pth")