#!/usr/bin/env python3
# https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html
# http://bytepawn.com/hacker-news-embeddings-with-pytorch.html
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import json
from random import choice, random, shuffle
torch.manual_seed(1)
# Load the game -> links mapping from a JSON-lines file: each line is
# expected to be an object with 'title' and 'links' keys.
game2links = {}
for line in open('game2link'):
    game = json.loads(line)
    game2links[game['title']] = game['links']
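# A hypothetical input line, for illustration only:
#   {"title": "Super Mario Bros.", "links": ["Nintendo", "Platform game"]}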
game2id = {kk: ii for ii, kk in enumerate(game2links.keys())}
id2game = {ii: gg for gg, ii in game2id.items()}

link2id = {}
for game, links in game2links.items():
    for link in links:
        if link not in link2id:
            link2id[link] = len(link2id)
id2link = {ii: ll for ll, ii in link2id.items()}
# Every observed (game, link) co-occurrence is a positive training pair.
pairs = []
for game, links in game2links.items():
    for link in links:
        pairs.append([game2id[game], link2id[link]])
class GameEmbedding(torch.nn.Module):
    def __init__(self, num_games, num_links, embedding_dim=64):
        super(GameEmbedding, self).__init__()
        # max_norm=1.0 renormalizes looked-up rows into the unit ball,
        # keeping the dot products below in a bounded range
        self.game_embedding = torch.nn.Embedding(num_games, embedding_dim, max_norm=1.0)
        self.link_embedding = torch.nn.Embedding(num_links, embedding_dim, max_norm=1.0)
        self.embedding_dim = embedding_dim

    def forward(self, batch):
        # each item in the batch is [game, link, label],
        # where label is 1 (true pair) or -1 (negative sample)
        t1 = self.game_embedding(torch.LongTensor([v[0] for v in batch]))
        t2 = self.link_embedding(torch.LongTensor([v[1] for v in batch]))
        # batched dot product: (B, 1, D) x (B, D, 1) -> (B, 1, 1)
        dot_products = torch.bmm(
            t1.contiguous().view(len(batch), 1, self.embedding_dim),
            t2.contiguous().view(len(batch), self.embedding_dim, 1)
        )
        return dot_products.contiguous().view(len(batch))
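# A minimal shape check, with hypothetical toy sizes (not in the original):
#   m = GameEmbedding(num_games=3, num_links=5, embedding_dim=4)
#   m([[0, 1, 1], [2, 4, -1]])  # -> tensor of shape (2,), one score per pair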
def build_minibatch(num_positives, num_negatives):
    minibatch = []
    for _ in range(num_positives):
        minibatch.append(choice(pairs) + [1])
    for _ in range(num_negatives):
        # rejection-sample a (game, link) pair that never co-occurs
        while True:
            gidx = int(random() * len(game2id))
            lidx = int(random() * len(link2id))
            if id2link[lidx] not in game2links[id2game[gidx]]:
                break
        minibatch.append([gidx, lidx, -1])
    shuffle(minibatch)
    #for game, link, ii in minibatch[:10]:
    #    print(ii, id2game[game], id2link[link], sep='\t')
    return minibatch
embedding_dim = 64
model = GameEmbedding(len(game2id), len(link2id), embedding_dim)
optimizer = torch.optim.Adam(model.parameters())
loss_function = torch.nn.MSELoss(reduction='mean')
num_epochs = 50
num_positives = 500
num_negatives = 500
num_steps_per_epoch = len(pairs) // num_positives
for i in range(num_epochs):
    for j in range(num_steps_per_epoch):
        optimizer.zero_grad()
        minibatch = build_minibatch(num_positives, num_negatives)
        y = model(minibatch)
        target = torch.FloatTensor([v[2] for v in minibatch])
        loss = loss_function(y, target)
        # report the loss before the first update as a baseline
        if i == 0 and j == 0:
            print('initial loss = %.3f' % float(loss))
        loss.backward()
        optimizer.step()
    print('%s: loss = %.3f' % (i, float(loss)))
# print out some samples to see how good the fit is
minibatch = build_minibatch(5, 5)
y = model(minibatch)
target = torch.FloatTensor([v[2] for v in minibatch])
print('Sample predictions (score vs target):')
for i in range(5 + 5):
    print('%.3f vs %.3f' % (float(y[i]), float(target[i])))
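# --- Usage sketch, not part of the original gist ---
# The learned game vectors live in model.game_embedding.weight. The helper
# below finds the games most similar to a query title by cosine similarity;
# 'Some Game' in the example is a placeholder and must exist in game2id.
def most_similar(title, topn=10):
    with torch.no_grad():
        # max_norm only renormalizes rows on lookup, so normalize explicitly
        vecs = F.normalize(model.game_embedding.weight, dim=1)
        sims = vecs @ vecs[game2id[title]]
        best = torch.topk(sims, topn + 1).indices.tolist()
    # the query is always its own nearest neighbor, so skip it
    return [id2game[ii] for ii in best if id2game[ii] != title][:topn]

# e.g. print(most_similar('Some Game'))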