# Filter a directory of libgen plaintext extractions down to English books that are
# already in one-line-per-paragraph format, then clean up hyphenation, page numbers,
# and repeated headers before printing the surviving paragraphs.
import os
import re
import fasttext
# fastText's pretrained language identification model (lid.176.bin)
lid = fasttext.load_model("lid.176.bin")
def listdir(x):
    # full paths of the entries in directory x
    return [x + '/' + fn for fn in os.listdir(x)]
def id(x):
    return x
def mean(x):
    x = list(x)
    if not x: return 0
    return sum(x) / len(x)
def nonzero(x):
    # drop falsy elements (e.g. empty strings)
    return filter(id, x)
def is_letter(x):
    return x in "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
def all_equal(x):
    return all([n == x[0] for n in x])
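
# The TODO in the main loop below asks for a converter that turns one-line-per-printed-line
# extractions into the one-line-per-paragraph format the rest of the script assumes.
# This is only a rough sketch, not part of the original gist and never called by it:
# it assumes paragraphs in line-wrapped extractions are separated by blank lines, which
# may not hold for every pdf-to-text tool.
def join_wrapped_lines(text):
    paras = []
    cur = []
    for line in text.split('\n'):
        if line.strip():
            cur.append(line.strip())
        elif cur:
            # a blank line closes off the current paragraph
            paras.append(' '.join(cur))
            cur = []
    if cur:
        paras.append(' '.join(cur))
    return '\n'.join(paras)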
for f in listdir('txt/0'):
    with open(f) as fin:
        contents = fin.read()
        # keep only books whose first ~1KB is detected as English
        if lid.predict(contents.replace('\n', ' ')[:1024])[0][0] == '__label__en':
            mean_line_len = mean(nonzero(map(len, contents.split('\n'))))
            # skip books where each line is a printed line rather than a paragraph/page:
            # in libgen half the pdfs extract as one line per paragraph and the others as
            # one line per printed line, and the code below assumes one line per paragraph
            # to make life easier
            # TODO: for pdfs where 1 line != 1 paragraph, convert them into that format
            # (a rough sketch is join_wrapped_lines above)
            if mean_line_len < 200:
                continue
            # re-join words hyphenated across a line break (they look like: hyph- hyphenated)
            contents = re.sub(r'(\w+)- \1', r'\1', contents)
            res = []
            for para in nonzero(contents.split('\n')):
                # remove short paras and those which contain this weird artifact
                if len(para) < 100 or '\u0001' in para:
                    continue
                # remove leading and trailing numbers (i.e. page numbers)
                para = para.strip()
                para = re.sub(r'^(\d+)', '', para)
                para = re.sub(r'(\d+)$', '', para)
                para = para.strip()
                # skip paras without enough letters (i.e. math, tables, etc.)
                if mean(map(is_letter, para)) < 0.80:
                    continue
                res.append(para)
            for i in range(len(res) - 2):
                # remove any prefix that appears in both this paragraph and either of the
                # next two, to try and get rid of repeated page headers
                # TODO: spin this out into a fully fledged header cleaner
                # (a possible shape is sketched in strip_repeated_headers at the end of this file)
                commpref = os.path.commonprefix([
                    res[i],
                    res[i+2],
                ])
                if len(commpref) > 3: res[i] = res[i][len(commpref):]
                commpref = os.path.commonprefix([
                    res[i],
                    res[i+1],
                ])
                if len(commpref) > 3: res[i] = res[i][len(commpref):]
            print('\n\n===============================================\n\n', '\n\n'.join(res), '\n\n')
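
# A possible shape for the "fully fledged header cleaner" mentioned in the TODO above.
# This is a hedged sketch, not part of the original gist and never called by it: it
# generalizes the inline common-prefix trick by counting how often each candidate prefix
# recurs across the whole book and stripping the ones that repeat. The function name and
# the lookahead/min_len/min_count parameters are made up for illustration.
from collections import Counter

def strip_repeated_headers(paras, lookahead=5, min_len=4, min_count=3):
    # candidate headers: common prefixes between each paragraph and its next few neighbours
    candidates = Counter()
    for i, p in enumerate(paras):
        for j in range(i + 1, min(i + 1 + lookahead, len(paras))):
            pref = os.path.commonprefix([p, paras[j]])
            if len(pref) >= min_len:
                candidates[pref] += 1
    # any prefix that recurs often enough is treated as a running header and stripped
    headers = {pref for pref, n in candidates.items() if n >= min_count}
    cleaned = []
    for p in paras:
        for h in sorted(headers, key=len, reverse=True):
            if p.startswith(h):
                p = p[len(h):].lstrip()
                break
        cleaned.append(p)
    return cleaned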