Skip to content

Instantly share code, notes, and snippets.

@beefy
Created February 10, 2022 13:35
Show Gist options
  • Save beefy/3f6b667c887cbd49794812b479757240 to your computer and use it in GitHub Desktop.
Save beefy/3f6b667c887cbd49794812b479757240 to your computer and use it in GitHub Desktop.
Generate Markov-chain text based on PDF files
import pickle
import random
length = 500


def generate_text(markov, length=500, rng=random):
    """Generate text by randomly walking a word-level Markov chain.

    Args:
        markov: Mapping from a word (bytes) to the list of words (bytes)
            that were observed directly after it.
        length: Number of words to emit.
        rng: Object providing ``choice`` (the ``random`` module by default).
            Pass a seeded ``random.Random`` for reproducible output.

    Returns:
        The generated text. Every word, including the last one, is followed
        by a single space. An empty string is returned when ``markov`` is
        empty or ``length`` is smaller than 1.
    """
    vocabulary = list(markov)
    if not vocabulary or length < 1:
        return ""

    current = rng.choice(vocabulary)
    pieces = [current.decode("ascii", "ignore")]
    for _ in range(length - 1):
        followers = markov.get(current)
        # Dead end (word never seen with a successor): restart the walk
        # from a random known word.
        following = rng.choice(followers if followers else vocabulary)
        # Start a new sentence with a capital letter. Only the emitted text
        # is capitalized: the chain state keeps the word exactly as stored,
        # otherwise the next lookup would miss the (lower-cased) key.
        shown = following.capitalize() if b"." in current else following
        pieces.append(shown.decode("ascii", "ignore"))
        current = following
    return " ".join(pieces) + " "


if __name__ == "__main__":
    # NOTE: pickle.load can execute arbitrary code. Only load a
    # markov.pickle file that you generated yourself.
    with open("markov.pickle", "rb") as f:
        markov = pickle.load(f)
    print(generate_text(markov, length))
import os
import PyPDF2
import re
import enchant
# Spell-check dictionary. In the code below it is only referenced by the
# commented-out "remove non words" filter, so it is currently unused at runtime.
english_words = enchant.Dict("en_US")
book_dir = "books/pdf"
output_dir = "books/txt"

# Compiled once instead of once per page.
_WHITESPACE_RUN = re.compile(r'\s+')


def normalize_page_text(text):
    """Return ``text`` with whitespace runs collapsed to one space, lower-cased.

    Args:
        text: Raw text extracted from a single PDF page.

    Returns:
        The cleaned text.
    """
    # remove extra whitespace
    text = _WHITESPACE_RUN.sub(' ', text)
    # Stricter cleaning steps, currently disabled:
    # # remove punctuation and numbers
    # text = re.sub(r'[^a-zA-Z ]', '', text)
    # # remove non words
    # words = [
    #     word for word in text.split(' ')
    #     if word.strip() and english_words.check(word)
    # ]
    # text = ' '.join(words)
    # lower case
    return text.lower()


def convert_books(source_dir=book_dir, target_dir=output_dir):
    """Write the cleaned text of every PDF page to its own ``.txt`` file.

    Each page of each file found in ``source_dir`` is written to
    ``<target_dir>/<pdf file name>_page_<page number>.txt``.

    Args:
        source_dir: Directory containing the PDF files.
        target_dir: Directory receiving the text files; created if missing.
    """
    os.makedirs(target_dir, exist_ok=True)
    for book in os.listdir(source_dir):
        print(book)
        # Context managers guarantee both files are closed, even when
        # reading or extracting a page raises.
        with open(os.path.join(source_dir, book), "rb") as pdf_file:
            reader = PyPDF2.PdfFileReader(pdf_file)
            for page_num in range(reader.numPages):
                text = reader.getPage(page_num).extractText()
                page_name = "{}_page_{}.txt".format(book, page_num)
                page_path = os.path.join(target_dir, page_name)
                # Explicit UTF-8 so characters outside the platform default
                # encoding cannot abort the conversion.
                with open(page_path, "w", encoding="utf-8") as output_file:
                    output_file.write(normalize_page_text(text))


if __name__ == "__main__":
    convert_books()
import os
import pickle
input_dir = "books/txt"


def add_transitions(markov, words):
    """Record every adjacent word pair of ``words`` in ``markov``.

    Args:
        markov: Plain dict mapping a word to the list of words seen directly
            after it; updated in place.
        words: Sequence of words in reading order.

    Returns:
        The same ``markov`` dict, for convenience.
    """
    for cur_word, next_word in zip(words, words[1:]):
        # setdefault keeps ``markov`` a plain dict, so looking up an unknown
        # word still raises KeyError for consumers of the pickled model.
        markov.setdefault(cur_word, []).append(next_word)
    return markov


def build_markov(source_dir=input_dir):
    """Build the word-transition table from every file in ``source_dir``.

    Files are read as bytes, so the words stored in the table are ``bytes``.
    Transitions are collected per file; the last word of one file is not
    linked to the first word of the next.

    Args:
        source_dir: Directory containing the text files.

    Returns:
        Dict mapping each word to the list of words observed after it.
    """
    markov = {}
    for book in os.listdir(source_dir):
        print(book)
        # ``with`` closes the handle; the original left it open.
        with open(os.path.join(source_dir, book), "rb") as text_file:
            words = text_file.read().strip().split()
        add_transitions(markov, words)
    return markov


if __name__ == "__main__":
    markov = build_markov(input_dir)
    with open("markov.pickle", "wb") as f:
        pickle.dump(markov, f)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment