Skip to content

Instantly share code, notes, and snippets.

@jsettlem
Created June 19, 2023 23:24
Show Gist options
  • Save jsettlem/f8b5c38555dfa302a640434da4a7353e to your computer and use it in GitHub Desktop.
Save jsettlem/f8b5c38555dfa302a640434da4a7353e to your computer and use it in GitHub Desktop.
import codecs
import random
from pprint import pprint
from sklearn.manifold import TSNE
from collections import defaultdict
from gensim.models import KeyedVectors
import numpy as np
import json
from sklearn.neighbors import NearestNeighbors
import struct
from semantle_words_3 import frpergJbeqf as old_frpergJbeqf
from semantle_words_4 import frpergJbeqf
# Lower-cased token -> integer count, read from the word-frequency dump.
# Tokens missing from the file read back as 0 thanks to the defaultdict.
frequency = defaultdict(int)
with open("../../../enwiki-20190320-words-frequency.txt", mode='r', encoding="utf-8") as frequency_file:
    for entry in frequency_file:
        # Only the first two whitespace-separated fields are used:
        # the token and its count.
        fields = entry.split()
        token = fields[0].lower()
        frequency[token] = int(fields[1])
def normalize(v):
    """Return *v* scaled to unit Euclidean length.

    A vector whose norm is exactly zero cannot be scaled, so it is handed
    back unchanged (the very same object) instead of dividing by zero.
    """
    length = np.linalg.norm(v)
    return v if length == 0 else v / length
def pol2cart(rho, phi):
    """Convert polar coordinates to Cartesian ones.

    ``rho`` is the radius and ``phi`` the angle in radians; both may be
    scalars or NumPy arrays (combined element-wise). Returns the tuple
    ``(x, y)``.
    """
    return rho * np.cos(phi), rho * np.sin(phi)
_MODEL_PATH = "../../../pruned_words.bin"
_MODEL_CACHE = {}


def _load_model(path=_MODEL_PATH):
    """Return the KeyedVectors stored at *path*, reading the file only once per process."""
    if path not in _MODEL_CACHE:
        _MODEL_CACHE[path] = KeyedVectors.load(path)
    return _MODEL_CACHE[path]


def generate_puzzle(puzzle_index = 1, puzzle_type="pimantle", word=""):
    """Build the 2-D word layout for one puzzle and write it to a binary file.

    Args:
        puzzle_index: Number used in the output file name; for
            ``puzzle_type == "semantle"`` it also indexes ``frpergJbeqf``.
        puzzle_type: ``"semantle"`` takes the secret word from the
            rot13-encoded ``frpergJbeqf`` list; any other value uses
            ``word``. The output directory is ``secret_words`` for
            ``"pimantle"`` and ``semantle_words`` otherwise.
        word: The secret word (ignored for ``"semantle"`` puzzles).

    Output file layout (little-endian):
        * ``uint32`` - model index of the secret word;
        * then one ``<Iffe`` record per word (``uint32`` model index,
          ``float32`` x, ``float32`` y, ``float16`` similarity), ordered
          by descending rounded similarity.
    """
    # Loading the model is expensive and it is never modified here, so it is
    # shared between calls instead of being re-read for every puzzle.
    model: "KeyedVectors" = _load_model()

    if puzzle_type == "semantle":
        secret_word = codecs.encode(frpergJbeqf[puzzle_index], 'rot_13')
    else:
        secret_word = word

    # Rank the whole vocabulary against the secret word. The secret word is
    # put first by hand with similarity 1.0 (presumably similar_by_key leaves
    # the query key out of its results - confirm against the gensim docs).
    ranked = model.similar_by_key(secret_word, topn=len(model.index_to_key))
    labels = [secret_word] + [key for key, _ in ranked]
    indexes = [model.key_to_index[label] for label in labels]
    similarities = [1.0] + [score for _, score in ranked]

    # `labels` already starts with the secret word, so its vector must not be
    # prepended again: the previous `[vector] + ndarray` expression was
    # broadcast by NumPy and added the secret vector to every row.
    vectors = np.array([model[label] for label in labels])
    for vec in vectors:
        vec[:] = normalize(vec)

    # A 1-D t-SNE embedding provides the angular ordering of the words.
    layout_1d = TSNE(n_components=1, init="random", learning_rate="auto", verbose=1).fit_transform(
        vectors)

    # Two random rotations: one proportional to similarity, one global.
    spiral_factor = random.uniform(-1.5 * np.pi, 1.5 * np.pi)
    twist_factor = random.uniform(-2 * np.pi, 2 * np.pi)

    similarity_array = np.array(similarities)
    # Similarity 1 maps to radius 0, similarity 0 to radius 49_999.
    radius = 50_000 ** (1 - similarity_array) - 1
    # Rescale the embedding to [0, 2*pi] before applying the rotations.
    angle = np.squeeze((layout_1d - np.min(layout_1d)) / np.ptp(layout_1d) * np.pi * 2)
    angle = angle + similarity_array * spiral_factor + twist_factor
    positions = np.column_stack(pol2cart(radius, angle)) / 50_000

    # Move every word towards its 10 nearest neighbours in the 2-D layout,
    # weighted by semantic similarity to each neighbour and by
    # (1 - similarity to the secret word).
    neighbors = NearestNeighbors(n_neighbors=10, n_jobs=-1).fit(positions).kneighbors(return_distance=False)
    neighbor_similarities = np.array(
        [
            [model.similarity(labels[i], labels[j]) for j in row]
            for i, row in enumerate(neighbors)
        ],
        ndmin=2,
    )
    # Shape (n_words, 10, 2): offset from each word to each of its neighbours.
    neighbor_offsets = positions[neighbors] - positions[:, np.newaxis, :]
    pull = (
        neighbor_offsets
        * neighbor_similarities[:, :, np.newaxis]
        * (1 - similarity_array)[:, np.newaxis, np.newaxis]
    )
    positions += pull.sum(axis=1)

    dump = [
        [index, round(position[0], 6), round(position[1], 6), round(similarity, 3)]
        for index, position, similarity in zip(indexes, positions, similarities)
    ]
    dump.sort(key=lambda entry: entry[3], reverse=True)

    record = struct.Struct("<Iffe")
    buff = bytearray(struct.pack("<I", model.key_to_index[secret_word]))
    for entry in dump:
        buff.extend(record.pack(*entry))

    directory = 'secret_words' if puzzle_type == 'pimantle' else 'semantle_words'
    with open(f"../public/{directory}/secret_word_{puzzle_index}.bin", "wb") as f:
        f.write(buff)
# Index given to the first puzzle generated by this run. The value was
# hard-coded (twice) in the original script; presumably lower indices were
# already generated earlier - confirm before re-running.
FIRST_PUZZLE_INDEX = 233

# Read one candidate secret word per line.
pimantle_puzzles = []
with open("pimantle_words_2.txt", 'r', encoding="utf-8") as f:
    for line in f:
        puzzle_word = line.strip()
        # A blank line would otherwise become an empty secret word.
        if puzzle_word:
            pimantle_puzzles.append(puzzle_word)

random.shuffle(pimantle_puzzles)

for i, puzzle in enumerate(pimantle_puzzles, start=FIRST_PUZZLE_INDEX):
    generate_puzzle(puzzle_index=i, word=puzzle, puzzle_type="pimantle")
    # The log is appended after every puzzle so progress is recorded even if
    # a later puzzle fails; words are rot13-encoded in the log.
    with open("puzzle_log.txt", 'a', encoding="utf-8") as f:
        f.write(f"pimantle puzzle{i} generated: {codecs.encode(puzzle, 'rot_13')}\n")
# for i in range(200, 400):
# print("generating semantle puzzle", i)
# if i > 199 or frpergJbeqf[i] != old_frpergJbeqf[i]:
# generate_puzzle(puzzle_index=i, puzzle_type="semantle", word=frpergJbeqf[i])
# else:
# print("skipping", i)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment