jeacom25b/phrase_gen.py

## phrase_gen.py
from random import randint, shuffle, choice, random
from collections import defaultdict
from math import log10
import time
import re
import os

# Number of words in every candidate phrase (individual).
SEQ_LENGTH = 10
# Number of candidate phrases kept in each generation.
POPULATION = 100
# Factor the score multiplier is scaled by for every repeated word and for
# every target word whose occurrence count is off (see score_word_seq).
REPETITION_PENALITY = 0.9
# Path of the text file the vocabulary and the snippet table are built from.
INPUT_TEXT = "data/gaming.txt"

# (snippet length in words, score contribution) pairs.  A candidate receives
# the second value once for every window of that length that also occurs in
# the input text; the length-8 entry is a large negative value, so matching
# runs that long lower the score instead of raising it.
RULES = (
    (2, 0.001),
    (3, 0.01),
    (4, 0.1),
    (5, 1),
    (6, 10),
    (7, 100),
    (8, -100000),
 )

# (word, required count) pairs.  A candidate whose count of a word differs
# from the required count is penalized, and the words are placed at the front
# of the vocabulary.  Empty by default.
TARGET_WORDS = ()

def pick(vocabulary, fequency_factor=20):
    """Return a random element of ``vocabulary``, biased toward its start.

    A uniform sample from [0, 1) is raised to the power ``fequency_factor``
    before it is scaled to an index, so larger factors push the choice
    toward index 0, while a factor of 0 always selects the last element.

    Args:
        vocabulary: non-empty indexable sequence to draw from.
        fequency_factor: exponent applied to the uniform sample.

    Returns:
        The element at the computed index.
    """
    last_index = len(vocabulary) - 1
    skewed = random() ** fequency_factor
    return vocabulary[round(skewed * last_index)]

def pick2(vocabulary, fequency_factor=8):
    """Draw two independent biased picks from ``vocabulary``.

    Both draws go through ``pick`` with the same ``fequency_factor``; the two
    results may be the same element.

    Returns:
        A 2-tuple ``(first, second)``.
    """
    first = pick(vocabulary, fequency_factor)
    second = pick(vocabulary, fequency_factor)
    return first, second

def neighborhood(seq, idx, radius):
    """Return the ``radius`` items of ``seq`` that start at ``idx`` as a tuple.

    Returns ``None`` when ``idx + radius`` exceeds the length of ``seq``,
    i.e. when the window would run past the end.
    """
    end = idx + radius
    if end > len(seq):
        return None
    return tuple(seq[idx:end])

def score_word_seq(word_seq, snippets_frequency):
    """Score a candidate word sequence against the known snippets.

    For every ``(length, reward)`` pair in ``RULES`` the reward is added once
    for each start position whose window of that length has a truthy entry in
    ``snippets_frequency``.  The total is then multiplied by a penalty factor
    that is scaled by ``REPETITION_PENALITY`` once for every repeated word
    occurrence and once for every ``TARGET_WORDS`` entry whose required count
    is not matched exactly.

    Args:
        word_seq: sequence of words supporting ``len``, ``count`` and slicing.
        snippets_frequency: mapping from word tuples to occurrence counts.

    Returns:
        The penalized score.
    """
    penalty = 1
    already_seen = set()
    for word in word_seq:
        if word not in already_seen:
            already_seen.add(word)
            continue
        penalty *= REPETITION_PENALITY

    for target_word, target_count in TARGET_WORDS:
        if word_seq.count(target_word) == target_count:
            continue
        penalty *= REPETITION_PENALITY

    total = 0
    positions = range(len(word_seq))
    for radius, reward in RULES:
        for start in positions:
            # Windows that run past the end come back as None, which is never
            # a key of the snippet table.
            window = neighborhood(word_seq, start, radius)
            if snippets_frequency.get(window):
                total += reward
    return total * penalty

def mutate(word_seq, context, vocabulary, memory, word_swap=0.03, split_swap=0.03):
    """Return a mutated copy of ``word_seq``; the input list is not modified.

    One random position is always overwritten with a biased pick from
    ``vocabulary``.  After that, each of the following happens with its own
    independent probability:

    * ``word_swap``: the same position is overwritten with a word that
      ``context`` lists for a randomly chosen ``memory`` word;
    * ``word_swap``: a second random position gets a biased pick from
      ``memory``;
    * ``word_swap``: two random positions exchange their words;
    * ``split_swap``: the sequence is rotated at a random position.

    Args:
        word_seq: list of words to mutate.
        context: mapping from a word to a list of related words.
        vocabulary: non-empty sequence of all candidate words.
        memory: non-empty sequence of remembered words.
        word_swap: probability of each of the three word-level mutations.
        split_swap: probability of the rotation.

    Returns:
        A new list holding the mutated sequence.
    """
    word_seq = word_seq.copy()
    if not word_seq:
        # randint(0, -1) would raise ValueError; there is nothing to mutate.
        return word_seq

    last = len(word_seq) - 1
    index0 = randint(0, last)
    index1 = randint(0, last)
    index2 = randint(0, last)

    word_seq[index1] = pick(vocabulary)

    if random() < word_swap:
        # A word can map to an empty list (context_gen creates an entry even
        # when it finds no preceding words) and choice() raises IndexError on
        # an empty sequence, so only substitute when there is a candidate.
        related = context.get(choice(memory))
        if related:
            word_seq[index1] = choice(related)

    if random() < word_swap:
        word_seq[index2] = pick(memory)

    if random() < word_swap:
        word_seq[index0], word_seq[index1] = word_seq[index1], word_seq[index0]

    if random() < split_swap:
        word_seq = word_seq[index0:] + word_seq[:index0]

    return word_seq

def crosover(word_seq_a, word_seq_b):
    """Combine two parents into a child, word by word.

    At each position the child takes the word of one parent, chosen at
    random.  The child is as long as the shorter parent.

    Returns:
        A new list of words.
    """
    paired_words = zip(word_seq_a, word_seq_b)
    return [choice(options) for options in paired_words]

def context_gen(words_seq, window=5):
    """Map every word to the distinct words seen shortly before it.

    For each occurrence of a word, the up-to-``window`` words immediately
    preceding it are added to that word's context.  Every word of
    ``words_seq`` gets an entry, even when nothing precedes it (the entry is
    then an empty list).

    Args:
        words_seq: sequence of words in reading order.
        window: how many preceding words to record per occurrence.

    Returns:
        A ``defaultdict`` mapping each word to a list of its context words
        (duplicates removed, order unspecified).
    """
    preceding = defaultdict(set)
    for position, word in enumerate(words_seq):
        # Clamp the start at 0: a negative start would be interpreted by the
        # slice as an offset from the end, leaving the first positions with an
        # empty (or wrapped-around) context.
        start = max(0, position - window)
        preceding[word].update(words_seq[start:position])

    # Convert to lists so callers can use random.choice on the values.
    for word, neighbours in preceding.items():
        preceding[word] = list(neighbours)

    return preceding

def preprocess(text):
    """Lower-case ``text`` and split it into a list of words.

    Words are separated by any run of whitespace (spaces, newlines, tabs,
    carriage returns) of any length.  Leading and trailing whitespace is
    ignored, so the result never contains empty strings.

    Args:
        text: the raw input text.

    Returns:
        A list of lower-cased words; empty when ``text`` holds no words.
    """
    # str.split() without a separator collapses whitespace runs of any length,
    # unlike a fixed number of "  " -> " " replacement passes.
    return text.lower().split()

# Module-level tables filled from the input text below:
#   vocabulary          - words ordered from most to least frequent
#   context             - word -> words seen shortly before it
#   snippets_frequency  - word tuple -> number of occurrences in the text
vocabulary = []
context = {}
snippets_frequency = {}

with open(INPUT_TEXT, "r") as file:
    string = file.read()
    words_seq = preprocess(string)

    print(words_seq)
    print("so many words")
    print("Bulding a huge datastructure, hol-don!\n\n\n")

    context = context_gen(words_seq)

    # Raw occurrence counts, keyed in order of first appearance.
    words_frequency = {}
    for token in words_seq:
        words_frequency[token] = words_frequency.get(token, 0) + 1

    # Keep the counts on a log10 scale.
    words_frequency = {token: log10(count) for token, count in words_frequency.items()}

    # Count every window of each length listed in RULES.
    for radius, _ in RULES:
        for start in range(len(words_seq)):
            snippet = neighborhood(words_seq, start, radius)
            if snippet:
                snippets_frequency[snippet] = snippets_frequency.get(snippet, 0) + 1

    # Most frequent words first, with the target words placed ahead of them.
    ranked = sorted(words_frequency.items(), key=lambda entry: entry[1], reverse=True)
    vocabulary = [target for target, _ in TARGET_WORDS] + [token for token, _ in ranked]

def score(indiv):
    """Score ``indiv``, taking the last finished phrase into account.

    While ``done`` is empty the individual is scored on its own; afterwards
    it is scored with the last entry of ``done`` prepended to it.
    """
    candidate = done[-1] + indiv if done else indiv
    return score_word_seq(candidate, snippets_frequency)


def _random_population():
    """Create POPULATION random phrases of SEQ_LENGTH vocabulary words each."""
    return [[pick(vocabulary) for _ in range(SEQ_LENGTH)] for _ in range(POPULATION)]


def _rarity(word):
    """Sort key: stored frequency of ``word``, or -1 when it has none."""
    return words_frequency.get(word, -1)


individuals = _random_population()

step = 0

# Phrases finished so far, and the words the mutation step may reuse.
done = []
memory = ["what"]

# Runs until the process is interrupted.
while True:
    step += 1
    individuals.sort(key=score, reverse=True)

    # Progress report every 100 generations.
    if step % 100 == 0:
        os.system("clear")
        top_five = [" ".join(indiv) for indiv in individuals[:5]]
        print("\n".join(top_five))
        print("\n")
        print("top gene score: ", score(individuals[0]))
        print(f"generation: {step}")
        print("memory: ", memory)
        print()
        finished = [" ".join(indiv) for indiv in done]
        print(" ".join(finished))

    # Every 600 generations: keep the best phrase, remember its five
    # lowest-frequency words and restart from a fresh random population.
    if step % 600 == 0:
        step = 0
        champion = individuals[0]
        done.append(champion)
        memory = sorted(set(memory + champion), key=_rarity)[:5]
        individuals = _random_population()

    # Next generation: the current first individual plus mutated children.
    offspring = []
    for _ in range(POPULATION - 1):
        parent_a, parent_b = pick2(individuals)
        offspring.append(mutate(crosover(parent_a, parent_b), context, vocabulary, memory))
    individuals = [individuals[0]] + offspring
	from random import randint, shuffle, choice, random
	from collections import defaultdict
	from math import log10
	import time
	import re
	import os

	# Number of words in every candidate phrase (individual).
	SEQ_LENGTH = 10
	# Number of candidate phrases kept in each generation.
	POPULATION = 100
	# Factor the score multiplier is scaled by for every repeated word and for
	# every target word whose occurrence count is off (see score_word_seq).
	REPETITION_PENALITY = 0.9
	# Path of the text file the vocabulary and the snippet table are built from.
	INPUT_TEXT = "data/gaming.txt"

	# (snippet length in words, score contribution) pairs; the length-8 entry
	# is a large negative value, so matching runs that long lower the score.
	RULES = (
	(2, 0.001),
	(3, 0.01),
	(4, 0.1),
	(5, 1),
	(6, 10),
	(7, 100),
	(8, -100000),
	)

	# (word, required count) pairs; empty by default.
	TARGET_WORDS = ()

	def pick(vocabulary, fequency_factor=20):
	    """Return a random element of ``vocabulary``, biased toward its start.

	    A uniform sample from [0, 1) is raised to the power ``fequency_factor``
	    before it is scaled to an index, so larger factors push the choice
	    toward index 0, while a factor of 0 always selects the last element.

	    Args:
	        vocabulary: non-empty indexable sequence to draw from.
	        fequency_factor: exponent applied to the uniform sample.

	    Returns:
	        The element at the computed index.
	    """
	    last_index = len(vocabulary) - 1
	    skewed = random() ** fequency_factor
	    return vocabulary[round(skewed * last_index)]

	def pick2(vocabulary, fequency_factor=8):
	    """Draw two independent biased picks from ``vocabulary``.

	    Both draws go through ``pick`` with the same ``fequency_factor``; the
	    two results may be the same element.

	    Returns:
	        A 2-tuple ``(first, second)``.
	    """
	    first = pick(vocabulary, fequency_factor)
	    second = pick(vocabulary, fequency_factor)
	    return first, second

	def neighborhood(seq, idx, radius):
	    """Return the ``radius`` items of ``seq`` that start at ``idx`` as a tuple.

	    Returns ``None`` when ``idx + radius`` exceeds the length of ``seq``,
	    i.e. when the window would run past the end.
	    """
	    end = idx + radius
	    if end > len(seq):
	        return None
	    return tuple(seq[idx:end])

	def score_word_seq(word_seq, snippets_frequency):
	    """Score a candidate word sequence against the known snippets.

	    For every ``(length, reward)`` pair in ``RULES`` the reward is added
	    once for each start position whose window of that length has a truthy
	    entry in ``snippets_frequency``.  The total is then multiplied by a
	    penalty factor that is scaled by ``REPETITION_PENALITY`` once for every
	    repeated word occurrence and once for every ``TARGET_WORDS`` entry
	    whose required count is not matched exactly.

	    Args:
	        word_seq: sequence of words supporting ``len``, ``count`` and slicing.
	        snippets_frequency: mapping from word tuples to occurrence counts.

	    Returns:
	        The penalized score.
	    """
	    penalty = 1
	    already_seen = set()
	    for word in word_seq:
	        if word in already_seen:
	            penalty *= REPETITION_PENALITY
	        else:
	            already_seen.add(word)

	    for target_word, target_count in TARGET_WORDS:
	        if word_seq.count(target_word) != target_count:
	            penalty *= REPETITION_PENALITY

	    total = 0
	    for radius, reward in RULES:
	        for start in range(len(word_seq)):
	            # Windows that run past the end come back as None, which is
	            # never a key of the snippet table.
	            window = neighborhood(word_seq, start, radius)
	            if snippets_frequency.get(window):
	                total += reward
	    return total * penalty

	def mutate(word_seq, context, vocabulary, memory, word_swap=0.03, split_swap=0.03):
	    """Return a mutated copy of ``word_seq``; the input list is not modified.

	    One random position is always overwritten with a biased pick from
	    ``vocabulary``.  After that, each of the following happens with its own
	    independent probability:

	    * ``word_swap``: the same position is overwritten with a word that
	      ``context`` lists for a randomly chosen ``memory`` word;
	    * ``word_swap``: a second random position gets a biased pick from
	      ``memory``;
	    * ``word_swap``: two random positions exchange their words;
	    * ``split_swap``: the sequence is rotated at a random position.

	    Args:
	        word_seq: list of words to mutate.
	        context: mapping from a word to a list of related words.
	        vocabulary: non-empty sequence of all candidate words.
	        memory: non-empty sequence of remembered words.
	        word_swap: probability of each of the three word-level mutations.
	        split_swap: probability of the rotation.

	    Returns:
	        A new list holding the mutated sequence.
	    """
	    word_seq = word_seq.copy()
	    if not word_seq:
	        # randint(0, -1) would raise ValueError; there is nothing to mutate.
	        return word_seq

	    last = len(word_seq) - 1
	    index0 = randint(0, last)
	    index1 = randint(0, last)
	    index2 = randint(0, last)

	    word_seq[index1] = pick(vocabulary)

	    if random() < word_swap:
	        # A word can map to an empty list and choice() raises IndexError on
	        # an empty sequence, so only substitute when there is a candidate.
	        related = context.get(choice(memory))
	        if related:
	            word_seq[index1] = choice(related)

	    if random() < word_swap:
	        word_seq[index2] = pick(memory)

	    if random() < word_swap:
	        word_seq[index0], word_seq[index1] = word_seq[index1], word_seq[index0]

	    if random() < split_swap:
	        word_seq = word_seq[index0:] + word_seq[:index0]

	    return word_seq

	def crosover(word_seq_a, word_seq_b):
	    """Combine two parents into a child, word by word.

	    At each position the child takes the word of one parent, chosen at
	    random.  The child is as long as the shorter parent.

	    Returns:
	        A new list of words.
	    """
	    return [choice(options) for options in zip(word_seq_a, word_seq_b)]

	def context_gen(words_seq, window=5):
	    """Map every word to the distinct words seen shortly before it.

	    For each occurrence of a word, the up-to-``window`` words immediately
	    preceding it are added to that word's context.  Every word of
	    ``words_seq`` gets an entry, even when nothing precedes it (the entry
	    is then an empty list).

	    Args:
	        words_seq: sequence of words in reading order.
	        window: how many preceding words to record per occurrence.

	    Returns:
	        A ``defaultdict`` mapping each word to a list of its context words
	        (duplicates removed, order unspecified).
	    """
	    preceding = defaultdict(set)
	    for position, word in enumerate(words_seq):
	        # Clamp the start at 0: a negative start would be interpreted by
	        # the slice as an offset from the end of the sequence.
	        start = max(0, position - window)
	        preceding[word] |= set(words_seq[start:position])

	    # Convert to lists so callers can use random.choice on the values.
	    for word, neighbours in preceding.items():
	        preceding[word] = list(neighbours)

	    return preceding

	def preprocess(text):
	    """Lower-case ``text`` and split it into a list of words.

	    Words are separated by any run of whitespace (spaces, newlines, tabs,
	    carriage returns) of any length.  Leading and trailing whitespace is
	    ignored, so the result never contains empty strings.

	    Args:
	        text: the raw input text.

	    Returns:
	        A list of lower-cased words; empty when ``text`` holds no words.
	    """
	    # str.split() without a separator collapses whitespace runs of any
	    # length; replacing " " with " " was a no-op and left empty tokens.
	    return text.lower().split()

	# Module-level tables filled from the input text below:
	#   vocabulary          - words ordered from most to least frequent
	#   context             - word -> words seen shortly before it
	#   snippets_frequency  - word tuple -> number of occurrences in the text
	vocabulary = []
	context = {}
	snippets_frequency = {}

	with open(INPUT_TEXT, "r") as file:
	    string = file.read()

	    words_seq = preprocess(string)
	    print(words_seq)
	    print("so many words")
	    print("Bulding a huge datastructure, hol-don!\n\n\n")
	    context = context_gen(words_seq)

	    # Raw occurrence counts, keyed in order of first appearance.
	    words_frequency = {}
	    for word in words_seq:
	        words_frequency[word] = words_frequency.get(word, 0) + 1

	    # Keep the counts on a log10 scale.
	    for key in words_frequency:
	        words_frequency[key] = log10(words_frequency[key])

	    # Count every window of each length listed in RULES.
	    for radius, score_multiplier in RULES:
	        for i in range(len(words_seq)):
	            neigh = neighborhood(words_seq, i, radius)
	            if neigh:
	                snippets_frequency[neigh] = snippets_frequency.get(neigh, 0) + 1

	    # Most frequent words first, with the target words placed ahead of them.
	    ranked = sorted(words_frequency.items(), key=lambda item: item[1], reverse=True)
	    vocabulary = [word for word, count in TARGET_WORDS] + [item[0] for item in ranked]

	def score(indiv):
	    """Score ``indiv``, taking the last finished phrase into account.

	    While ``done`` is empty the individual is scored on its own; afterwards
	    it is scored with the last entry of ``done`` prepended to it.
	    """
	    if not done:
	        return score_word_seq(indiv, snippets_frequency)
	    return score_word_seq(done[-1] + indiv, snippets_frequency)


	individuals = [[pick(vocabulary) for _ in range(SEQ_LENGTH)] for _ in range(POPULATION)]

	step = 0

	# Phrases finished so far, and the words the mutation step may reuse.
	done = []
	memory = ["what"]

	# Runs until the process is interrupted.
	while True:
	    step += 1
	    individuals = sorted(individuals, key=score, reverse=True)

	    # Progress report every 100 generations.
	    if step % 100 == 0:
	        os.system("clear")
	        print("\n".join(" ".join(indiv) for indiv in individuals[:5]))
	        print("\n")
	        print("top gene score: ", score(individuals[0]))
	        print(f"generation: {step}")
	        print("memory: ", memory)
	        print()
	        print(" ".join(" ".join(indiv) for indiv in done))

	    # Every 600 generations: keep the best phrase, remember its five
	    # lowest-frequency words and restart from a fresh random population.
	    if step % 600 == 0:
	        step = 0
	        done.append(individuals[0])
	        memory.extend(individuals[0])
	        memory = sorted(set(memory), key=lambda w: words_frequency.get(w, -1))[:5]
	        individuals = [[pick(vocabulary) for _ in range(SEQ_LENGTH)] for _ in range(POPULATION)]

	    # Next generation: the current first individual plus mutated children.
	    individuals = [individuals[0]] + [
	        mutate(crosover(*pick2(individuals)), context, vocabulary, memory)
	        for _ in range(POPULATION - 1)
	    ]