Skip to content

Instantly share code, notes, and snippets.

@amakukha
Last active November 3, 2023 23:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save amakukha/d519f9b09ed593bab9632bf12176c151 to your computer and use it in GitHub Desktop.
Save amakukha/d519f9b09ed593bab9632bf12176c151 to your computer and use it in GitHub Desktop.
Generates a huge artificial text
#!/usr/bin/env python3
'''
A script to generate text files that look like a novel in TXT form.
Words are completely made up, but vaguely resemble the Finnish language.
The resulting text uses ASCII encoding with only printable characters.
Distribution of words follows Zipf's law.
Standard parameters generate 1 GB text with 148391 distinct words.
Used to benchmark solutions of the Bentley's k most frequent words problem:
https://codegolf.stackexchange.com/q/188133/
'''
import re, sys
from random import randint, seed, shuffle, expovariate
from collections import Counter
import urllib.request
# NOTE: changing these parameters, apart from the book size, will change the text contents
BOOK_SIZE = 1<<30 # 1 GB
DISTINCT_WORDS = 5000000 # bigger number will allow longer longest word
MEAN = 15000 # bigger number will increase average word length
VOWELS = 'aeiouy'
FORBIDDEN = ['satan', 'lenin', 'stalin', 'hitl', 'naz', 'rus', 'putin']
seed(63245986) # with this seed, the title should be "Itera Aeno", md5sum = 4dcf116dc35156ec939f8cafd61bdf18
print('Getting the reference text', file=sys.stderr)
text = urllib.request.urlopen('http://www.gutenberg.org/files/11940/11940-8.txt').read().decode('ISO-8859-1').lower()
text = text.replace('ä','a').replace('å','a').replace('ö','o') # specific to Finnish
start = re.search('start of th.*$', text, re.M).end()
end = re.search('^.*end of th', text, re.M).start()
text = text[start:end].strip()
_, __, text = text.split('\n',2) # discard two first lines
print(text[:100], file=sys.stderr)
print('...', file=sys.stderr)
print(text[-100:], file=sys.stderr)
print('Getting reference words', file=sys.stderr)
all_words = re.findall('[a-z]+', text.lower())
all_words = set(all_words)
print(len(all_words),'reference words', file=sys.stderr)
print('Training Markov chain', file=sys.stderr)
markov = {'total': 0}
def train_markov(word):
global markov
m = markov
for letter in word:
if m==markov:
m['total'] += 1
if letter in m:
m = m[letter]
m['total'] += 1
else:
m[letter] = {'total': 1}
m = m[letter]
for word in all_words:
while len(word)>=3:
train_markov(word)
word = word[1:]
print('Generating artificial words', file=sys.stderr)
def next_letter(word):
global markov
m = markov
for letter in word[-2:]:
if letter not in m:
m = markov
m = m[letter]
i = randint(0, m['total']-1)
for c in range(ord('a'), ord('z')+1):
c = chr(c)
if c not in m:
continue
if m[c]['total'] > i:
return c
else:
i -= m[c]['total']
word_set = set()
word_list = []
while len(word_set)<DISTINCT_WORDS:
w = ''
m = markov
while not w or w in word_set:
w += next_letter(w)
if not [b for b in FORBIDDEN if b in w] and [v for v in VOWELS if v in w]:
word_set.add(w)
word_list.append(w)
if len(word_set) % 100000 == 0:
print(len(word_set),'words generated', file=sys.stderr)
del word_set
print('Capitalizing some words', file=sys.stderr)
for i in range(len(word_list)):
if not randint(0,100):
word_list[i] = word_list[i].capitalize()
print('Generating text', file=sys.stderr)
class Book:
TAB = ' '
LINE_WIDTH = 76
def __init__(self):
self.title = ''
self.author = ''
self.year = '2019'
self.verlag = '' # "Publisher"
self.line = ''
self.front = False
self.capitalize = True
self.counter = Counter()
self.length = 0
def len(self):
return self.length
def next_word(self, word):
self.counter.update([word])
if not self.front:
if self.title.count(' ') < 2:
self.title += word.capitalize() + ' '
elif self.author.count(' ') < 2:
self.author += word.capitalize() + ' '
else:
self.verlag = word.capitalize()
self.print_front()
return
if self.capitalize:
word = word.capitalize()
self.capitalize = False
paragraph = False
if not randint(0,9):
word += ','
elif not randint(0,9):
word += '.'
self.capitalize = True
if not randint(0,9):
paragraph = True
if len(self.line + ' ' + word) > self.LINE_WIDTH:
self.length += len(self.line) + 1
print(self.line)
self.line = word
elif paragraph:
self.line = self.line + ' ' + word + '\n'
self.length += len(self.line) + 1
print(self.line)
self.line = self.TAB
else:
self.line += ' ' + word
def print_front(self):
print(self.title.rstrip().upper()+'\n')
print(self.author.rstrip()+'\n')
print('(c) {}, {}, Public domain\n'.format(self.year, self.verlag))
self.line = self.TAB
self.front = True
def end(self):
if self.line.strip():
print(self.line+'.')
print('\n--' + '\n'*7 + 'Most common words:')
for w,f in self.counter.most_common()[:10]:
print('-',w)
LAMBDA = 1 / MEAN
book = Book()
while book.len() < BOOK_SIZE:
i = int(expovariate(LAMBDA))
if i < len(word_list):
book.next_word(word_list[i])
book.end()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment