@Pop101
Created September 19, 2020 19:41
A command-line interface for extracting a PDF's usable text into JSON files, separated by chapter.
import sys
from utils import *
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer, LTChar, LTPage, LTTextLine

def avg_char_height(container:LTTextContainer):
    chars = n_sample(container, 2, required_types=[LTTextLine, LTChar], max_samples=[4, 20])
    char_size = list(map(lambda c: c.size, chars))
    if len(char_size) > 0: return sum(char_size)/len(char_size)
    return 0

def most_common_font(page:LTPage):
    chars = n_sample(page, 3, required_types=[LTTextContainer, LTTextLine, LTChar], max_samples=[50, 3, 20])
    if len(chars) <= 0: return ''
    chars = [char.fontname for char in chars]
    return Freq_list(chars)[-1]

def get_font(container:LTTextContainer):
    chars = n_sample(container, 2, required_types=[LTTextLine, LTChar], max_samples=[4, 20])
    chars = [char.fontname for char in chars]
    if len(chars) > 0:
        return Freq_list(chars)[-1]
    return ''

def parse_pdf(path:str, chapter_threshold:float=4.5, verbose=False):
    if verbose: print('Getting most common font... ')
    font, font_size = Freq_list(), Freq_list()
    for page in sample_list(extract_pages(path, maxpages=120), max_samples=120, fast=True):
        font.add(most_common_font(page))
        font_size.add_all([str(int(round(avg_char_height(c)))) for c in sample_list(page, fast=True, required_type=LTTextContainer)])
    if verbose: print('Most common font: '+str(font[-1])+' with size '+str(font_size[-1]))
    if verbose and len(font) > 3: print('The three most common fonts are '+str(font[-3:]))

    cptr_list = list()
    cptr_txt = ["", "", ""]  # [chapter title, page number, chapter text]
    for page_layout in extract_pages(path):
        for element in page_layout:
            if isinstance(element, LTTextContainer):
                # If this element is a title (chapter_threshold times as big as normal text)
                if avg_char_height(element) > chapter_threshold*int(font_size[-1]):
                    if len(cptr_txt[1]) > 0 and cptr_txt[2].count(' ') > 0:
                        cptr_list.append({'chapter': cptr_txt[0], 'page': cptr_txt[1], 'contents': cptr_txt[2]})
                        if verbose: print('Finished chapter '+str(cptr_txt[0:2]))
                        if verbose: print('Words in chapter '+str(cptr_txt[2].count(' ')))
                    cptr_txt[0] = element.get_text().encode('ascii', errors='ignore').strip().decode(errors='ignore')
                    cptr_txt[1] = str(page_layout.pageid)
                    cptr_txt[2] = ""
                # If this element has the correct font and similar size, add it to the chapter
                if len(font) > 0 and font[-1] in get_font(element) and abs(int(font_size[-1]) - avg_char_height(element)) < 2:
                    cptr_txt[2] += element.get_text().encode('ascii', errors='ignore').strip().decode(errors='ignore')
    # Flush the final chapter
    if len(cptr_txt[1]) > 0 and cptr_txt[2].count(' ') > 0:
        cptr_list.append({'chapter': cptr_txt[0], 'page': cptr_txt[1], 'contents': cptr_txt[2]})
        if verbose: print('Finished chapter '+str(cptr_txt[0:2]))
        if verbose: print('Words in chapter '+str(cptr_txt[2].count(' ')))
    return cptr_list
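
# Illustrative usage sketch (assumes a hypothetical local file named ./book.pdf).
# parse_pdf returns a list of dicts shaped like
# {'chapter': <title text>, 'page': <page id as str>, 'contents': <body text>}:
#
#   chapters = parse_pdf('./book.pdf', chapter_threshold=4.5, verbose=True)
#   for c in chapters:
#       print(c['page'], c['chapter'], c['contents'].count(' '), 'words')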
import sys, getopt, os, json
from pathlib import Path

def mutate_file_extension(file_path:Path, ext:str='.txt', dir:str='./pdfs'):
    if len(dir) <= 1:  # if no dir is given, put the output in the same dir as the original
        return file_path.parent / Path(file_path.stem + ext)
    os.makedirs(dir, exist_ok=True)
    return Path(dir) / Path(file_path.stem + ext)
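
# Illustrative sketch of mutate_file_extension (hypothetical paths):
#   mutate_file_extension(Path('books/alice.pdf'), dir='./out')  ->  Path('out/alice.txt')
#   mutate_file_extension(Path('books/alice.pdf'), dir='')       ->  Path('books/alice.txt')
# The first form also creates ./out if it does not exist yet.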

HELP_STR = """Usage:
python3 pdfextract.py <pdf 1> <pdf 2> ....
Note that every hundred pages takes about 1 minute to parse!
\nCommand Line Options:
-h --help: Prints this. Ignores all other options.
-c --clean: Deletes the default or the set output directory. Ignores all other options.
-i --inputfile: Defines a file listing pdf paths (1 per line) to go through (instead of args).
-o --outputdir: The output directory. The same dir as the given pdfs if not supplied.
-t --threshold: How many times bigger text needs to be than the surrounding text to count as a chapter title. Defaults to 4.5
-s --single: All outputs are concatenated into a single file (pdfs.txt)
-b --bypass: Skips the pdf if the target file already exists.
-v --verbose: Enables more detailed printing.
"""

def main(argv):
    pdfpaths = []
    threshold = 4.5
    outputdir = ''
    single = False
    bypass = False
    verbose = False

    # Add all pdf paths (iterate over a copy so removing args doesn't skip elements)
    for arg in list(argv):
        if not str(arg).startswith('-'):
            pdfpaths.append(arg)
            argv.remove(arg)

    try:
        opts, args = getopt.getopt(argv, "hi:o:sbvct:", ["help", "inputfile=", "outputdir=", "single", "bypass", "verbose", "clean", "threshold="])
    except getopt.GetoptError:
        print(HELP_STR)
        sys.exit(2)

    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print(HELP_STR)
            sys.exit(0)
        elif opt in ('-c', '--clean'):
            if os.path.exists(outputdir):
                import shutil
                shutil.rmtree(outputdir)
            sys.exit(0)
        elif opt in ('-s', '--single'):
            single = not single
        elif opt in ('-b', '--bypass'):
            bypass = not bypass
        elif opt in ('-v', '--verbose'):
            verbose = not verbose
        elif opt in ('-o', '--outputdir'):
            outputdir = arg
        elif opt in ('-t', '--threshold'):
            try:
                threshold = float(arg)
            except ValueError:
                print('Invalid threshold! Must be a valid number')
                sys.exit(2)
        elif opt in ('-i', '--inputfile'):
            if not os.path.exists(arg):
                print('Input file invalid!')
                sys.exit(2)
            with open(arg, 'r') as file:
                pdfpaths.extend(line.strip() for line in file)

    # Catch no options given
    if len(pdfpaths) <= 0:
        print(HELP_STR)
        sys.exit(0)

    # Make working directory
    if len(outputdir) > 1 and not os.path.exists(outputdir):
        os.makedirs(outputdir)

    # Loop through all pdfs
    pdf_dict = dict()
    for pdf in pdfpaths:
        if verbose: print('Beginning pdf "'+str(pdf)+'"')
        pdf_path = Path(pdf)
        if not pdf_path.exists():
            print('Error: pdf "'+pdf+'" does not exist')
            continue
        if mutate_file_extension(pdf_path, dir=outputdir).exists() and bypass:
            if verbose: print('Skipping pdf!')
            continue
        extracted_pdf = parse_pdf(pdf, chapter_threshold=threshold, verbose=verbose)
        if single:
            pdf_dict[pdf] = extracted_pdf
        else:
            extracted_pdf_dict = {'name': pdf_path.stem, 'content': extracted_pdf}
            with open(os.fspath(mutate_file_extension(pdf_path, dir=outputdir)), 'w') as file:
                json.dump(extracted_pdf_dict, file)

    if single:
        singleFile = os.path.join(outputdir, 'pdfs.txt')
        with open(singleFile, 'w') as file:
            json.dump(pdf_dict, file)

if __name__ == "__main__":
    main(sys.argv[1:])
# To load:
# import ast
# ast.literal_eval(str)
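#
# Illustrative sketch of reading an output file back (assumes a hypothetical
# ./out/book.txt produced by this script):
#
#   import json
#   with open('./out/book.txt') as f:
#       data = json.load(f)   # {'name': 'book', 'content': [{'chapter': ..., 'page': ..., 'contents': ...}, ...]}
#
# ast.literal_eval(open('./out/book.txt').read()) also works here, since the
# dumped data contains only strings, lists and dicts.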

import itertools
import collections.abc

class Freq_list:
    def __init__(self, base:iter=None):
        self.tracker = dict()
        if isinstance(base, collections.abc.Iterable): self.add_all(base)

    def add_all(self, iterable:iter):
        assert isinstance(iterable, collections.abc.Iterable)
        for i in iterable:
            self.add(i)

    def add(self, element, value=1):
        if element in self.tracker:
            self.tracker[element] += value
        else:
            self.tracker[element] = value

    def __iter__(self):
        # Iterates elements from least to most frequent, so index -1 is the most common
        if len(self.tracker) <= 0: return iter([])
        return iter([t[0] for t in sorted(self.tracker.items(), key=lambda kv: (kv[1], kv[0]))])

    def __add__(self, other):
        assert isinstance(other, collections.abc.Iterable)
        combined = self.copy()
        for element in other:
            if isinstance(other, Freq_list): combined.add(element, value=other.get_freq(element))
            else: combined.add(element)
        return combined

    def copy(self):
        cp = Freq_list()
        for i in self:
            cp.add(i, value=self.get_freq(i))
        return cp

    def to_dict(self):
        # Plain dict copy of element -> count
        return self.tracker.copy()

    def __len__(self):
        return sum(self.tracker.values())

    def get_freq(self, element):
        if element in self.tracker:
            return self.tracker[element]
        return 0

    def __getitem__(self, index):
        try:
            return list(self)[index]
        except IndexError:
            return None
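
# Illustrative usage sketch: Freq_list orders elements from least to most
# frequent, so index -1 is the most common one.
#
#   fl = Freq_list(['a', 'b', 'b', 'c', 'b'])
#   fl[-1]           -> 'b'
#   fl.get_freq('b') -> 3
#   len(fl)          -> 5  (total additions, not unique elements)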

def n_sample(collection, samples:int, max_samples=5, fast=True, required_types=None):
    # Repeatedly sample one level deeper into a nested collection, optionally
    # filtering each level by type. The lists are copied so callers' arguments
    # (and the defaults) are never mutated.
    required_types = list(required_types) if required_types else []
    required_types.extend([None]*samples)
    if not isinstance(max_samples, list): max_samples = [max_samples] * samples
    else: max_samples = list(max_samples)
    max_samples.extend([max_samples[-1]]*samples)
    sample = sample_list(collection, max_samples=max_samples[0], required_type=required_types[0], fast=fast)
    for i in range(1, samples):
        sample = [sample_list(obj, max_samples=max_samples[i], required_type=required_types[i], fast=fast) for obj in sample]
        sample = list(itertools.chain.from_iterable(sample))
    return sample

def sample_list(collection, max_samples=20, fast=False, required_type=None):
    if required_type != None and not isinstance(required_type, type):
        required_type = type(required_type)
    samples = list()
    if not fast:
        # Slow path: materialise the collection, then either check every element
        # (when filtering by type) or stride through it to spread the samples out
        collection = list(collection)
        i = 0
        while i < len(collection) and len(samples) < max_samples:
            if required_type == None or isinstance(collection[i], required_type):
                samples.append(collection[i])
            if required_type != None:
                i += 1
            else:
                i += max(1, int(len(collection)/max_samples))
    else:
        # Fast path: take the first max_samples matching elements
        for element in collection:
            if required_type == None or isinstance(element, required_type):
                samples.append(element)
            if len(samples) >= max_samples: break
    return samples
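
# Illustrative usage sketch: sampling nested plain-Python structures instead of
# pdfminer layout objects.
#
#   sample_list(['x', 1, 'y', 2, 3], max_samples=2, fast=True, required_type=int)
#       -> [1, 2]
#   n_sample([[1, 2, 'a'], [3, 'b']], 2, required_types=[list, int], max_samples=[2, 5])
#       -> [1, 2, 3]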