# beefy/generate.py

## generate.py
import pickle
import random

def _as_text(word):
    """Return *word* as text, silently dropping any non-ASCII bytes."""
    if isinstance(word, bytes):
        return word.decode("ascii", "ignore")
    return word


def generate_text(markov, length=500):
    """Random-walk a Markov chain and return the generated text.

    Args:
        markov: mapping of word -> list of words observed right after it.
            Words may be ``bytes`` or ``str``.
        length: number of words to emit.

    Returns:
        The words joined by single spaces and followed by one trailing
        space (the format this script has always printed).  An empty
        string when the chain is empty or ``length`` is less than 1.
    """
    vocabulary = list(markov)  # built once, reused for every random restart
    if not vocabulary or length < 1:
        return ""

    current = random.choice(vocabulary)
    pieces = [_as_text(current)]
    for _ in range(length - 1):
        successors = markov.get(current)
        if successors:
            following = random.choice(successors)
        else:
            # Dead end (word never seen with a successor): restart anywhere.
            following = random.choice(vocabulary)

        shown = _as_text(following)
        if "." in _as_text(current):
            # Start of a new sentence: capitalise only what is displayed.
            shown = shown.capitalize()
        pieces.append(shown)

        # Keep walking from the word exactly as stored in the chain.  Using
        # the capitalised form as the next key would miss the lookup and
        # break the chain after every sentence.
        current = following
    return " ".join(pieces) + " "


if __name__ == "__main__":
    # NOTE: unpickling can execute arbitrary code; only load a
    # markov.pickle file that you generated yourself.
    with open("markov.pickle", "rb") as f:
        markov = pickle.load(f)

    length = 500
    output = generate_text(markov, length)
    print(output)

## preprocess.py
import os
import PyPDF2
import re
import enchant

# Spell-check dictionary.  No active code in this script calls it; its only
# use is in a filtering step that is currently commented out.
english_words = enchant.Dict("en_US")
# Directory that is scanned for the input PDF files.
book_dir = "books/pdf"
# Directory that receives one .txt file per extracted PDF page.
output_dir = "books/txt"
# Compiled once instead of once per page.
_WHITESPACE_RUN = re.compile(r'\s+')


def normalize_text(text):
    """Collapse each run of whitespace to one space and lower-case *text*."""
    # Two further clean-up passes exist in the project history but are
    # intentionally disabled: stripping every character outside [a-zA-Z ]
    # and keeping only words accepted by the `english_words` dictionary.
    return _WHITESPACE_RUN.sub(' ', text).lower()


def convert_book(pdf_path, output_prefix):
    """Write the normalized text of every page of one PDF to its own file.

    Args:
        pdf_path: path of the PDF file to read.
        output_prefix: path prefix for the output; page N is written to
            ``<output_prefix>_page_<N>.txt``.
    """
    # The reader pulls pages lazily, so the PDF must stay open while the
    # pages are extracted; `with` guarantees it is closed afterwards.
    with open(pdf_path, "rb") as pdf_file:
        reader = PyPDF2.PdfFileReader(pdf_file)
        for page_num in range(reader.numPages):
            text = normalize_text(reader.getPage(page_num).extractText())
            page_path = output_prefix + "_page_" + str(page_num) + ".txt"
            with open(page_path, "w") as output_file:
                output_file.write(text)


if __name__ == "__main__":
    for book in os.listdir(book_dir):
        print(book)
        convert_book(os.path.join(book_dir, book),
                     os.path.join(output_dir, book))

## train.py
import os
import pickle

def add_transitions(markov, words):
    """Record every adjacent pair of *words* in *markov* and return it.

    Args:
        markov: dict mapping a word to the list of words seen right after
            it; updated in place.
        words: sequence of words in reading order.

    Returns:
        The same ``markov`` dict, for convenience.
    """
    for cur_word, next_word in zip(words, words[1:]):
        # Deliberately a plain dict with setdefault (not a defaultdict):
        # generate.py treats a missing key as "no known successor".
        markov.setdefault(cur_word, []).append(next_word)
    return markov


def build_markov(input_dir):
    """Build the word -> successors table from every file in *input_dir*.

    Files are read as bytes and split on whitespace, so the words stored
    in the table are ``bytes``.  Transitions never span two files.
    """
    markov = {}
    for book in os.listdir(input_dir):
        print(book)
        # `with` closes each file promptly instead of leaking the handle.
        with open(os.path.join(input_dir, book), "rb") as f:
            words = f.read().split()
        add_transitions(markov, words)
    return markov


if __name__ == "__main__":
    input_dir = "books/txt"
    markov = build_markov(input_dir)

    with open("markov.pickle", "wb") as f:
        pickle.dump(markov, f)
	import pickle
	import random

	def _as_text(word):
		"""Return *word* as text, silently dropping any non-ASCII bytes."""
		if isinstance(word, bytes):
			return word.decode("ascii", "ignore")
		return word


	def generate_text(markov, length=500):
		"""Random-walk a Markov chain and return the generated text.

		Args:
			markov: mapping of word -> list of words observed right after
				it.  Words may be ``bytes`` or ``str``.
			length: number of words to emit.

		Returns:
			The words joined by single spaces and followed by one trailing
			space (the format this script has always printed).  An empty
			string when the chain is empty or ``length`` is less than 1.
		"""
		vocabulary = list(markov)  # built once, reused for every restart
		if not vocabulary or length < 1:
			return ""

		current = random.choice(vocabulary)
		pieces = [_as_text(current)]
		for _ in range(length - 1):
			successors = markov.get(current)
			if successors:
				following = random.choice(successors)
			else:
				# Dead end (no known successor): restart anywhere.
				following = random.choice(vocabulary)

			shown = _as_text(following)
			if "." in _as_text(current):
				# Start of a new sentence: capitalise only what is displayed.
				shown = shown.capitalize()
			pieces.append(shown)

			# Keep walking from the word exactly as stored in the chain.
			# Using the capitalised form as the next key would miss the
			# lookup and break the chain after every sentence.
			current = following
		return " ".join(pieces) + " "


	if __name__ == "__main__":
		# NOTE: unpickling can execute arbitrary code; only load a
		# markov.pickle file that you generated yourself.
		with open("markov.pickle", "rb") as f:
			markov = pickle.load(f)

		length = 500
		output = generate_text(markov, length)
		print(output)
	import os
	import PyPDF2
	import re
	import enchant

	# Spell-check dictionary.  No active code in this script calls it; its
	# only use is in a filtering step that is currently commented out.
	english_words = enchant.Dict("en_US")
	# Directory that is scanned for the input PDF files.
	book_dir = "books/pdf"
	# Directory that receives one .txt file per extracted PDF page.
	output_dir = "books/txt"
	# Compiled once instead of once per page.
	_WHITESPACE_RUN = re.compile(r'\s+')


	def normalize_text(text):
		"""Collapse each run of whitespace to one space and lower-case *text*."""
		# Two further clean-up passes exist in the project history but are
		# intentionally disabled: stripping every character outside
		# [a-zA-Z ] and keeping only words accepted by `english_words`.
		return _WHITESPACE_RUN.sub(' ', text).lower()


	def convert_book(pdf_path, output_prefix):
		"""Write the normalized text of every page of one PDF to its own file.

		Args:
			pdf_path: path of the PDF file to read.
			output_prefix: path prefix for the output; page N is written to
				``<output_prefix>_page_<N>.txt``.
		"""
		# The reader pulls pages lazily, so the PDF must stay open while the
		# pages are extracted; `with` guarantees it is closed afterwards.
		with open(pdf_path, "rb") as pdf_file:
			reader = PyPDF2.PdfFileReader(pdf_file)
			for page_num in range(reader.numPages):
				text = normalize_text(reader.getPage(page_num).extractText())
				page_path = output_prefix + "_page_" + str(page_num) + ".txt"
				with open(page_path, "w") as output_file:
					output_file.write(text)


	if __name__ == "__main__":
		for book in os.listdir(book_dir):
			print(book)
			convert_book(os.path.join(book_dir, book),
				os.path.join(output_dir, book))
	import os
	import pickle

	def add_transitions(markov, words):
		"""Record every adjacent pair of *words* in *markov* and return it.

		Args:
			markov: dict mapping a word to the list of words seen right
				after it; updated in place.
			words: sequence of words in reading order.

		Returns:
			The same ``markov`` dict, for convenience.
		"""
		for cur_word, next_word in zip(words, words[1:]):
			# Deliberately a plain dict with setdefault (not a defaultdict):
			# generate.py treats a missing key as "no known successor".
			markov.setdefault(cur_word, []).append(next_word)
		return markov


	def build_markov(input_dir):
		"""Build the word -> successors table from every file in *input_dir*.

		Files are read as bytes and split on whitespace, so the words stored
		in the table are ``bytes``.  Transitions never span two files.
		"""
		markov = {}
		for book in os.listdir(input_dir):
			print(book)
			# `with` closes each file promptly instead of leaking the handle.
			with open(os.path.join(input_dir, book), "rb") as f:
				words = f.read().split()
			add_transitions(markov, words)
		return markov


	if __name__ == "__main__":
		input_dir = "books/txt"
		markov = build_markov(input_dir)

		with open("markov.pickle", "wb") as f:
			pickle.dump(markov, f)