@Pop101
Created September 19, 2020 19:41
A command-line interface for extracting a PDF's usable text into JSON files, separated by chapter.
import sys
from utils import *
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer, LTChar, LTPage, LTTextLine

def avg_char_height(container:LTTextContainer):
    chars = n_sample(container, 2, required_types=[LTTextLine, LTChar], max_samples=[4, 20])
    char_size = list(map(lambda c: c.size, chars))
    if len(char_size) > 0: return sum(char_size)/len(char_size)
    return 0

def most_common_font(page:LTPage):
    chars = n_sample(page, 3, required_types=[LTTextContainer, LTTextLine, LTChar], max_samples=[50, 3, 20])
    if len(chars) <= 0: return ''
    chars = [char.fontname for char in chars]
    return Freq_list(chars)[-1]

def get_font(container:LTTextContainer):
    chars = n_sample(container, 2, required_types=[LTTextLine, LTChar], max_samples=[4, 20])
    chars = [char.fontname for char in chars]
    if len(chars) > 0:
        return Freq_list(chars)[-1]
    return ''

def parse_pdf(path:str, chapter_threshold:float=4.5, verbose=False):
    if verbose: print('Getting most common font... ')
    font, font_size = Freq_list(), Freq_list()
    for page in sample_list(extract_pages(path, maxpages=120), max_samples=120, fast=True):
        font.add(most_common_font(page))
        font_size.add_all([str(int(round(avg_char_height(c)))) for c in sample_list(page, fast=True, required_type=LTTextContainer)])
    if verbose: print('Most common font: '+str(font[-1])+' with size '+str(font_size[-1]))
    if verbose and len(font) > 3: print('The three most common fonts are '+str(font[-3:]))

    cptr_list = list()
    cptr_txt = ["", "", ""]  # [chapter title, page number, chapter text]
    for page_layout in extract_pages(path):
        for element in page_layout:
            if isinstance(element, LTTextContainer):
                # If this element is a title (chapter_threshold times as big as normal text)
                if avg_char_height(element) > chapter_threshold*int(font_size[-1]):
                    if len(cptr_txt[1]) > 0 and cptr_txt[2].count(' ') > 0:
                        cptr_list.append({'chapter': cptr_txt[0], 'page': cptr_txt[1], 'contents': cptr_txt[2]})
                        if verbose: print('Finished chapter '+str(cptr_txt[0:2]))
                        if verbose: print('Words in chapter '+str(cptr_txt[2].count(' ')))
                    cptr_txt[0] = element.get_text().encode('ascii', errors='ignore').strip().decode(errors='ignore')
                    cptr_txt[1] = str(page_layout.pageid)
                    cptr_txt[2] = ""
                # If this element has the correct font and similar size, add it to the chapter
                if len(font) > 0 and font[-1] in get_font(element) and abs(int(font_size[-1]) - avg_char_height(element)) < 2:
                    cptr_txt[2] += element.get_text().encode('ascii', errors='ignore').strip().decode(errors='ignore')
    # Flush the final chapter
    if len(cptr_txt[1]) > 0 and cptr_txt[2].count(' ') > 0:
        cptr_list.append({'chapter': cptr_txt[0], 'page': cptr_txt[1], 'contents': cptr_txt[2]})
        if verbose: print('Finished chapter '+str(cptr_txt[0:2]))
        if verbose: print('Words in chapter '+str(cptr_txt[2].count(' ')))
    return cptr_list
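
# Illustrative usage sketch (assumes a hypothetical local file named ./book.pdf).
# parse_pdf returns a list of dicts shaped like
# {'chapter': <title text>, 'page': <page id as str>, 'contents': <body text>}:
#
#   chapters = parse_pdf('./book.pdf', chapter_threshold=4.5, verbose=True)
#   for c in chapters:
#       print(c['page'], c['chapter'], c['contents'].count(' '), 'words')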
import sys, getopt, os, json
from pathlib import Path

def mutate_file_extension(file_path:Path, ext:str='.txt', dir:str='./pdfs'):
    if len(dir) <= 1:  # if no dir is given, put the output in the same dir as the original
        return file_path.parent / Path(file_path.stem + ext)
    os.makedirs(dir, exist_ok=True)
    return Path(dir) / Path(file_path.stem + ext)
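
# Illustrative sketch of mutate_file_extension (hypothetical paths):
#   mutate_file_extension(Path('books/alice.pdf'), dir='./out')  ->  Path('out/alice.txt')
#   mutate_file_extension(Path('books/alice.pdf'), dir='')       ->  Path('books/alice.txt')
# The first form also creates ./out if it does not exist yet.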

HELP_STR = """Usage:
python3 pdfextract.py <pdf 1> <pdf 2> ....
Note that every hundred pages takes about 1 minute to parse!
\nCommand Line Options:
-h --help: Prints this. Ignores all other options.
-c --clean: Deletes the default or the set output directory. Ignores all other options.
-i --inputfile: Defines a file listing pdf paths (1 per line) to go through (instead of args).
-o --outputdir: The output directory. The same dir as the given pdfs if not supplied.
-t --threshold: How many times bigger text needs to be than the surrounding text to count as a chapter title. Defaults to 4.5
-s --single: All outputs are concatenated into a single file (pdfs.txt)
-b --bypass: Skips the pdf if the target file already exists.
-v --verbose: Enables more detailed printing.
"""

def main(argv):
    pdfpaths = []
    threshold = 4.5
    outputdir = ''
    single = False
    bypass = False
    verbose = False

    # Add all pdf paths (iterate over a copy so removing args doesn't skip elements)
    for arg in list(argv):
        if not str(arg).startswith('-'):
            pdfpaths.append(arg)
            argv.remove(arg)

    try:
        opts, args = getopt.getopt(argv, "hi:o:sbvct:", ["help", "inputfile=", "outputdir=", "single", "bypass", "verbose", "clean", "threshold="])
    except getopt.GetoptError:
        print(HELP_STR)
        sys.exit(2)

    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print(HELP_STR)
            sys.exit(0)
        elif opt in ('-c', '--clean'):
            if os.path.exists(outputdir):
                import shutil
                shutil.rmtree(outputdir)
            sys.exit(0)
        elif opt in ('-s', '--single'):
            single = not single
        elif opt in ('-b', '--bypass'):
            bypass = not bypass
        elif opt in ('-v', '--verbose'):
            verbose = not verbose
        elif opt in ('-o', '--outputdir'):
            outputdir = arg
        elif opt in ('-t', '--threshold'):
            try:
                threshold = float(arg)
            except ValueError:
                print('Invalid threshold! Must be a valid number')
                sys.exit(2)
        elif opt in ('-i', '--inputfile'):
            if not os.path.exists(arg):
                print('Input file invalid!')
                sys.exit(2)
            with open(arg, 'r') as file:
                pdfpaths.extend(line.strip() for line in file)

    # Catch no options given
    if len(pdfpaths) <= 0:
        print(HELP_STR)
        sys.exit(0)

    # Make working directory
    if len(outputdir) > 1 and not os.path.exists(outputdir):
        os.makedirs(outputdir)

    # Loop through all pdfs
    pdf_dict = dict()
    for pdf in pdfpaths:
        if verbose: print('Beginning pdf "'+str(pdf)+'"')
        pdf_path = Path(pdf)
        if not pdf_path.exists():
            print('Error: pdf "'+pdf+'" does not exist')
            continue
        if mutate_file_extension(pdf_path, dir=outputdir).exists() and bypass:
            if verbose: print('Skipping pdf!')
            continue
        extracted_pdf = parse_pdf(pdf, chapter_threshold=threshold, verbose=verbose)
        if single:
            pdf_dict[pdf] = extracted_pdf
        else:
            extracted_pdf_dict = {'name': pdf_path.stem, 'content': extracted_pdf}
            with open(os.fspath(mutate_file_extension(pdf_path, dir=outputdir)), 'w') as file:
                json.dump(extracted_pdf_dict, file)

    if single:
        singleFile = os.path.join(outputdir, 'pdfs.txt')
        with open(singleFile, 'w') as file:
            json.dump(pdf_dict, file)

if __name__ == "__main__":
    main(sys.argv[1:])
# To load:
# import ast
# ast.literal_eval(str)
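#
# Illustrative sketch of reading an output file back (assumes a hypothetical
# ./out/book.txt produced by this script):
#
#   import json
#   with open('./out/book.txt') as f:
#       data = json.load(f)   # {'name': 'book', 'content': [{'chapter': ..., 'page': ..., 'contents': ...}, ...]}
#
# ast.literal_eval(open('./out/book.txt').read()) also works here, since the
# dumped data contains only strings, lists and dicts.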

import itertools
import collections.abc

class Freq_list:
    def __init__(self, base:iter=None):
        self.tracker = dict()
        if isinstance(base, collections.abc.Iterable): self.add_all(base)

    def add_all(self, iterable:iter):
        assert isinstance(iterable, collections.abc.Iterable)
        for i in iterable:
            self.add(i)

    def add(self, element, value=1):
        if element in self.tracker:
            self.tracker[element] += value
        else:
            self.tracker[element] = value

    def __iter__(self):
        # Iterates elements from least to most frequent, so index -1 is the most common
        if len(self.tracker) <= 0: return iter([])
        return iter([t[0] for t in sorted(self.tracker.items(), key=lambda kv: (kv[1], kv[0]))])

    def __add__(self, other):
        assert isinstance(other, collections.abc.Iterable)
        combined = self.copy()
        for element in other:
            if isinstance(other, Freq_list): combined.add(element, value=other.get_freq(element))
            else: combined.add(element)
        return combined

    def copy(self):
        cp = Freq_list()
        for i in self:
            cp.add(i, value=self.get_freq(i))
        return cp

    def to_dict(self):
        # Plain dict copy of element -> count
        return self.tracker.copy()

    def __len__(self):
        return sum(self.tracker.values())

    def get_freq(self, element):
        if element in self.tracker:
            return self.tracker[element]
        return 0

    def __getitem__(self, index):
        try:
            return list(self)[index]
        except IndexError:
            return None
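
# Illustrative usage sketch: Freq_list orders elements from least to most
# frequent, so index -1 is the most common one.
#
#   fl = Freq_list(['a', 'b', 'b', 'c', 'b'])
#   fl[-1]           -> 'b'
#   fl.get_freq('b') -> 3
#   len(fl)          -> 5  (total additions, not unique elements)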

def n_sample(collection, samples:int, max_samples=5, fast=True, required_types=None):
    # Repeatedly sample one level deeper into a nested collection, optionally
    # filtering each level by type. The lists are copied so callers' arguments
    # (and the defaults) are never mutated.
    required_types = list(required_types) if required_types else []
    required_types.extend([None]*samples)
    if not isinstance(max_samples, list): max_samples = [max_samples] * samples
    else: max_samples = list(max_samples)
    max_samples.extend([max_samples[-1]]*samples)
    sample = sample_list(collection, max_samples=max_samples[0], required_type=required_types[0], fast=fast)
    for i in range(1, samples):
        sample = [sample_list(obj, max_samples=max_samples[i], required_type=required_types[i], fast=fast) for obj in sample]
        sample = list(itertools.chain.from_iterable(sample))
    return sample

def sample_list(collection, max_samples=20, fast=False, required_type=None):
    if required_type != None and not isinstance(required_type, type):
        required_type = type(required_type)
    samples = list()
    if not fast:
        # Slow path: materialise the collection, then either check every element
        # (when filtering by type) or stride through it to spread the samples out
        collection = list(collection)
        i = 0
        while i < len(collection) and len(samples) < max_samples:
            if required_type == None or isinstance(collection[i], required_type):
                samples.append(collection[i])
            if required_type != None:
                i += 1
            else:
                i += max(1, int(len(collection)/max_samples))
    else:
        # Fast path: take the first max_samples matching elements
        for element in collection:
            if required_type == None or isinstance(element, required_type):
                samples.append(element)
            if len(samples) >= max_samples: break
    return samples
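
# Illustrative usage sketch: sampling nested plain-Python structures instead of
# pdfminer layout objects.
#
#   sample_list(['x', 1, 'y', 2, 3], max_samples=2, fast=True, required_type=int)
#       -> [1, 2]
#   n_sample([[1, 2, 'a'], [3, 'b']], 2, required_types=[list, int], max_samples=[2, 5])
#       -> [1, 2, 3]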