Created
February 10, 2022 13:35
-
-
Save beefy/3f6b667c887cbd49794812b479757240 to your computer and use it in GitHub Desktop.
Generate text based on pdf files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Generate pseudo-random text from a pickled Markov chain.

The chain is a dict mapping a word (bytes) to the list of words (bytes)
observed directly after it.  Running this file as a script loads the chain
from ``markov.pickle`` and prints ``length`` generated words.
"""
import pickle
import random

MODEL_PATH = "markov.pickle"
length = 500


def generate_text(markov, length=500):
    """Return a string of words produced by a random walk over *markov*.

    Args:
        markov: dict mapping a bytes word to a non-empty list of bytes
            words that may follow it.
        length: number of words to emit.  The starting word is always
            emitted, so values below 1 still yield one word.

    Returns:
        The generated words, each followed by a single space.  Non-ASCII
        bytes are dropped when the words are decoded.

    Raises:
        ValueError: if *markov* is empty.
    """
    if not markov:
        raise ValueError("markov chain is empty; nothing to generate")

    # Materialise the keys once instead of on every dead-end fallback.
    keys = list(markov)

    current = random.choice(keys)
    pieces = [current.decode("ascii", "ignore")]
    for _ in range(length - 1):
        try:
            following = random.choice(markov[current])
        except KeyError:
            # Dead end (word only ever seen last): restart from a random key.
            following = random.choice(keys)

        # A word that follows a sentence end is capitalised in the output.
        shown = following.capitalize() if b"." in current else following
        pieces.append(shown.decode("ascii", "ignore"))

        # Keep the *raw* word as the lookup key.  Using the capitalised
        # form would miss chain keys that differ only by case and send
        # the walk to a random word after every sentence.
        current = following

    return " ".join(pieces) + " "


def main():
    """Load the pickled chain and print the generated text."""
    # NOTE: pickle.load can execute arbitrary code; only load a
    # markov.pickle file that you produced yourself.
    with open(MODEL_PATH, "rb") as f:
        markov = pickle.load(f)
    print(generate_text(markov, length))


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Dump the text of every PDF in ``books/pdf`` to per-page text files.

For each file in ``book_dir`` one file per page is written to
``output_dir`` as ``<book>_page_<n>.txt``.  Page text has runs of
whitespace collapsed to a single space and is lower-cased.
"""
import os
import re

import enchant
import PyPDF2

# Only needed by the optional "remove non words" step kept as a comment below.
english_words = enchant.Dict("en_US")
book_dir = "books/pdf"
output_dir = "books/txt"

# Compiled once; the same pattern is applied to every page.
whitespace = re.compile(r'\s+')

# Writing a page would fail if the target directory were missing.
os.makedirs(output_dir, exist_ok=True)

for book in os.listdir(book_dir):
    print(book)
    # The reader pulls pages from the file on demand, so the PDF has to
    # stay open until all pages of this book have been processed.
    with open(os.path.join(book_dir, book), "rb") as pdf_file:
        reader = PyPDF2.PdfFileReader(pdf_file)
        for page_num in range(reader.numPages):
            text = reader.getPage(page_num).extractText()
            # remove extra whitespace
            text = whitespace.sub(' ', text)
            # Optional cleaning steps, currently disabled:
            # # remove punctuation and numbers
            # pattern = re.compile(r'[^a-zA-Z ]')
            # text = re.sub(pattern, '', text)
            # # remove non words
            # words = [
            #     word for word in text.split(' ')
            #     if word.strip() and english_words.check(word)
            # ]
            # text = ' '.join(words)
            # lower case
            text = text.lower()
            page_path = os.path.join(
                output_dir, book + "_page_" + str(page_num) + ".txt"
            )
            # Explicit UTF-8 so characters outside the platform default
            # encoding cannot abort the run.
            with open(page_path, "w", encoding="utf-8") as output_file:
                output_file.write(text)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Build a word-level Markov chain from text files and pickle it.

Every file in ``books/txt`` is read as bytes and split on whitespace.  The
resulting chain maps each word to the list of words that followed it
(duplicates kept, so frequent successors are proportionally more likely to
be drawn) and is written to ``markov.pickle``.
"""
import os
import pickle

input_dir = "books/txt"
MODEL_PATH = "markov.pickle"


def add_transitions(markov, words):
    """Record every adjacent word pair of *words* in *markov*.

    Args:
        markov: dict mapping a word to the list of its observed
            successors; updated in place.
        words: sequence of words in reading order.

    Returns:
        The same *markov* dict, for convenience.
    """
    for cur_word, next_word in zip(words, words[1:]):
        # A plain dict with setdefault (not a defaultdict) keeps missing
        # words raising KeyError once the chain has been unpickled.
        markov.setdefault(cur_word, []).append(next_word)
    return markov


def build_chain(directory):
    """Return a Markov chain built from every file in *directory*."""
    markov = {}
    for book in os.listdir(directory):
        print(book)
        # Read as bytes; the context manager closes each file promptly.
        with open(os.path.join(directory, book), "rb") as f:
            words = f.read().split()
        add_transitions(markov, words)
    return markov


def main():
    """Build the chain from ``input_dir`` and write it to ``MODEL_PATH``."""
    markov = build_chain(input_dir)
    with open(MODEL_PATH, "wb") as f:
        pickle.dump(markov, f)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment