# Filter a directory of libgen plaintext extractions down to English books that are
# already in one-line-per-paragraph format, then clean up hyphenation, page numbers,
# and repeated headers before printing the surviving paragraphs.
import os
import re
import fasttext
# fastText's pretrained language identification model (lid.176.bin)
lid = fasttext.load_model("lid.176.bin")
def listdir(x):
    # full paths of the entries in directory x
    return [x + '/' + fn for fn in os.listdir(x)]
def id(x):
    return x
def mean(x):
    x = list(x)
    if not x: return 0
    return sum(x) / len(x)
def nonzero(x):
    # drop falsy elements (e.g. empty strings)
    return filter(id, x)
def is_letter(x):
    return x in "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
def all_equal(x):
    return all([n == x[0] for n in x])
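
# The TODO in the main loop below asks for a converter that turns one-line-per-printed-line
# extractions into the one-line-per-paragraph format the rest of the script assumes.
# This is only a rough sketch, not part of the original gist and never called by it:
# it assumes paragraphs in line-wrapped extractions are separated by blank lines, which
# may not hold for every pdf-to-text tool.
def join_wrapped_lines(text):
    paras = []
    cur = []
    for line in text.split('\n'):
        if line.strip():
            cur.append(line.strip())
        elif cur:
            # a blank line closes off the current paragraph
            paras.append(' '.join(cur))
            cur = []
    if cur:
        paras.append(' '.join(cur))
    return '\n'.join(paras)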
for f in listdir('txt/0'):
    with open(f) as fin:
        contents = fin.read()
        # keep only books whose first ~1KB is detected as English
        if lid.predict(contents.replace('\n', ' ')[:1024])[0][0] == '__label__en':
            mean_line_len = mean(nonzero(map(len, contents.split('\n'))))
            # skip books where each line is a printed line rather than a paragraph/page:
            # in libgen half the pdfs extract as one line per paragraph and the others as
            # one line per printed line, and the code below assumes one line per paragraph
            # to make life easier
            # TODO: for pdfs where 1 line != 1 paragraph, convert them into that format
            # (a rough sketch is join_wrapped_lines above)
            if mean_line_len < 200:
                continue
            # re-join words hyphenated across a line break (they look like: hyph- hyphenated)
            contents = re.sub(r'(\w+)- \1', r'\1', contents)
            res = []
            for para in nonzero(contents.split('\n')):
                # remove short paras and those which contain this weird artifact
                if len(para) < 100 or '\u0001' in para:
                    continue
                # remove leading and trailing numbers (i.e. page numbers)
                para = para.strip()
                para = re.sub(r'^(\d+)', '', para)
                para = re.sub(r'(\d+)$', '', para)
                para = para.strip()
                # skip paras without enough letters (i.e. math, tables, etc.)
                if mean(map(is_letter, para)) < 0.80:
                    continue
                res.append(para)
            for i in range(len(res) - 2):
                # remove any prefix that appears in both this paragraph and either of the
                # next two, to try and get rid of repeated page headers
                # TODO: spin this out into a fully fledged header cleaner
                # (a possible shape is sketched in strip_repeated_headers at the end of this file)
                commpref = os.path.commonprefix([
                    res[i],
                    res[i+2],
                ])
                if len(commpref) > 3: res[i] = res[i][len(commpref):]
                commpref = os.path.commonprefix([
                    res[i],
                    res[i+1],
                ])
                if len(commpref) > 3: res[i] = res[i][len(commpref):]
            print('\n\n===============================================\n\n', '\n\n'.join(res), '\n\n')
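
# A possible shape for the "fully fledged header cleaner" mentioned in the TODO above.
# This is a hedged sketch, not part of the original gist and never called by it: it
# generalizes the inline common-prefix trick by counting how often each candidate prefix
# recurs across the whole book and stripping the ones that repeat. The function name and
# the lookahead/min_len/min_count parameters are made up for illustration.
from collections import Counter

def strip_repeated_headers(paras, lookahead=5, min_len=4, min_count=3):
    # candidate headers: common prefixes between each paragraph and its next few neighbours
    candidates = Counter()
    for i, p in enumerate(paras):
        for j in range(i + 1, min(i + 1 + lookahead, len(paras))):
            pref = os.path.commonprefix([p, paras[j]])
            if len(pref) >= min_len:
                candidates[pref] += 1
    # any prefix that recurs often enough is treated as a running header and stripped
    headers = {pref for pref, n in candidates.items() if n >= min_count}
    cleaned = []
    for p in paras:
        for h in sorted(headers, key=len, reverse=True):
            if p.startswith(h):
                p = p[len(h):].lstrip()
                break
        cleaned.append(p)
    return cleaned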