Created February 10, 2019 19:42
zeph-turner/28ed67c8b7f963ee3ec4f2b86210c729
Generate mashup languages with textgenrnn.
'''
Dependency: textgenrnn (can be installed with pip: `pip install textgenrnn`,
or `!pip install textgenrnn` from inside a Jupyter notebook).

Written and tested with Python 3.6 in Jupyter Notebook.

This function was created to automatically train a neural net that can
generate words from "mashup languages". Several source files are input,
each containing words, names, etc. from a single language. The function
aggregates the contents of the input files, resampling shorter files so
that each language is represented in the output, trains a neural net on
the aggregated data, generates 2000 words at various temperatures, and
saves the results to a file.

The function reads words from the files named in the list `filenames`
(a list of strings) in the directory specified by `fileroot` (a string;
see the default value for an example).

The files are "equalized" by randomly resampling from shorter files to
make them match the length (in words) of the longest file, so that each
input file has an approximately equal bearing on the neural net weights.
The neural net is trained on the combined wordlist, then generates new
words. The list of new words is compared to the original dataset, and any
words that were reproduced verbatim are discarded. Lastly, the new words
are saved to a file.

I recommend using a new directory (specified with `fileroot`) for each
new mashup to avoid clobbering files, as the output filenames are static.

For input files, try word frequency lists from Wiktionary:
https://en.wiktionary.org/wiki/Wiktionary:Frequency_lists
Or try lists of first names from Wikipedia:
https://en.wikipedia.org/wiki/Category:Given_names_by_culture
'''
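The "equalization" described above can be exercised on its own with just the standard library. A minimal sketch (the word lists here are made-up placeholders, not from the gist):

```python
import random

# Shorter lists are padded with a random resampling of themselves until
# every list matches the length of the longest one, so each source
# contributes equally to the combined training data.
lists = [["a", "b", "c", "d", "e"], ["x", "y"]]
maxlen = max(len(lst) for lst in lists)

evened = []
for lst in lists:
    padded = list(lst)  # copy so the source list is not mutated
    if len(padded) < maxlen:
        padded.extend(random.choices(padded, k=maxlen - len(padded)))
    evened.extend(padded)

# Every list now contributes exactly maxlen entries.
print(len(evened))  # → 10
```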
def generate_original_words(filenames, temps=[0.5, 0.8, 1.0, 1.2], fileroot="D:\\Documents\\Jupyter_Languages\\"):
    from textgenrnn import textgenrnn
    import random
    filelists = []
    #Open each file and read it into a wordlist, stripping trailing
    #whitespace (including the newline).
    for filename in filenames:
        with open(fileroot + filename, mode="r", encoding="utf8") as tempfile:
            filelists.append([line.rstrip() for line in tempfile])
    #Prepare to write all lists to the combined-words temporary file.
    combined_words = open(fileroot + "combined_tmp.txt", mode="w",
                          encoding="utf8")
    #Resample each list to make it match the length of the longest list.
    #This prevents the neural net from being biased toward one input file.
    maxlen = max([len(x) for x in filelists])
    evenlists = []
    for wordlist in filelists:
        #Copy the list so resampling does not mutate the caller's data.
        templist = list(wordlist)
        #Extend the wordlist with a random resampling of itself.
        if len(templist) < maxlen:
            templist.extend(random.choices(templist, k=maxlen - len(templist)))
        evenlists.extend(templist)
    #Write all words to the aggregated file, save, and close.
    combined_words.write("\n".join(evenlists))
    combined_words.close()
    #Train the neural net.
    textgen = textgenrnn()
    textgen.train_from_file(fileroot + "combined_tmp.txt",
                            new_model=False,
                            gen_epochs=5,
                            num_epochs=10,
                            word_level=False,
                            verbose=0)
    #Generate a list of new words. An equal number of words is generated
    #for each temperature in the temps parameter.
    new_words = textgen.generate(n=2000,
                                 return_as_list=True,
                                 temperature=temps,
                                 prefix="")
    #Convert to a set so we can use set difference to eliminate duplicates.
    new_words_set = set(new_words)
    #Reopen the training set to compare with the generated words.
    old_words_file = open(fileroot + "combined_tmp.txt", mode="r",
                          encoding="utf8")
    #rstrip() removes the trailing newline from each word.
    old_words_list = [line.rstrip() for line in old_words_file]
    old_words_file.close()
    #Set-difference the new words against the old, keeping only words
    #that do not appear verbatim in the training data.
    orig_new_words = new_words_set.difference(set(old_words_list))
    #Save the new words to a file.
    results_file = open(fileroot + "original_generated_words.txt", mode="w+",
                        encoding="utf8")
    results_file.write("\n".join(orig_new_words))
    results_file.close()
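The final filtering step can likewise be tested without textgenrnn. A minimal sketch with made-up word lists: set difference drops any generated word that appears verbatim in the training data, and deduplicates the generated list as a side effect.

```python
# Illustrative word lists (not real textgenrnn output).
generated = ["lano", "miru", "tova", "lano"]  # hypothetical generated words
training = ["lano", "kesh"]                   # hypothetical training words

# Keep only words that are original with respect to the training data.
original_words = set(generated).difference(training)
print(sorted(original_words))  # → ['miru', 'tova']
```

A hypothetical call to the function itself would then look like `generate_original_words(["english.txt", "finnish.txt"], fileroot="./mashup1/")`, where the filenames and directory are placeholders for your own word lists.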