Skip to content

Instantly share code, notes, and snippets.

@gordonbrander
Last active November 6, 2022 07:36
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gordonbrander/44621df8403442625e838babe3032db6 to your computer and use it in GitHub Desktop.
Save gordonbrander/44621df8403442625e838babe3032db6 to your computer and use it in GitHub Desktop.
textiness.py — script to filter text based on how "texty" it is

Textiness

Example using pdfminer to extract text and then filter out lines that aren't "texty" enough.

pdf2txt.py something.pdf | ./textiness.py > temp.txt

For example, I'm personally using this to extract the text of the IPCC report on 1.5°C:

pdf2txt.py SR15_Chapter4_Low_Res.pdf | ./unwrap.py -e ".}" | ./textiness.py > SR15_Chapter4_Low_Res.txt

unwrap.py takes care of reformatting lines of text into coherent paragraphs. textiness.py then filters out lines that don't appear to be texty.

#!/usr/bin/env python3
"""
Filter lines of text by "textiness".
"""
import argparse
import sys
import re
import string
# Command-line interface: input/output locations plus the two tunable
# thresholds used by the textiness heuristic.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "-f", "--file",
    type=str,
    default="-",
    help="Input file (- for stdin)",
)
parser.add_argument(
    "-o", "--output",
    type=str,
    default="-",
    help="Output file (- for stdout)",
)
parser.add_argument(
    "-w", "--word_threshold",
    type=int,
    default=10,
    help="Word threshold",
)
parser.add_argument(
    "-p", "--prosiness_threshold",
    type=float,
    default=0.8,
    help="Prosiness threshold",
)
def count_words(line):
    """Return the number of whitespace-separated words in line."""
    return len(line.split())


# Characters typical of prose: letters plus common sentence punctuation.
PROSEY = r"[A-Za-z\,\.]"
# Characters suggesting non-prose content: digits, markup, code, math symbols.
NONPROSEY = r"[0-9_\@\&\^\%\$\#\<\>\[\]\{\}\*\/\\]"


def report_prosiness(line):
    """Return a (prosey, nonprosey) tuple of character counts for line."""
    return (
        len(re.findall(PROSEY, line)),
        len(re.findall(NONPROSEY, line))
    )


def is_texty(line, word_threshold=10, prosiness_threshold=0.8):
    """
    Heuristically decide whether line looks like prose.

    A line is "texty" when it contains more than word_threshold words and
    its share of non-prose characters is low enough that
    (1 - nonprosey / prosey) exceeds prosiness_threshold.

    Lines with no prose characters at all (e.g. a run of digits or
    symbols) are never texty; checking for that up front also prevents a
    ZeroDivisionError when prosey == 0.
    """
    prosey, nonprosey = report_prosiness(line)
    if prosey == 0:
        # No letters/commas/periods at all — cannot be prose, and the
        # ratio below would divide by zero.
        return False
    return (
        count_words(line) > word_threshold and
        (1.0 - (nonprosey / prosey)) > prosiness_threshold
    )
def process(lines, word_threshold=10, prosiness_threshold=0.8):
    """Yield only those lines from an iterable that pass the is_texty filter."""
    for candidate in lines:
        if not is_texty(candidate, word_threshold, prosiness_threshold):
            continue
        yield candidate
def main():
    """
    Parse CLI arguments, filter the input lines by textiness, and write
    the survivors to the output.

    Files opened here (anything other than stdin/stdout) are closed on
    exit, even if processing raises.
    """
    args = parser.parse_args()
    input_file = open(args.file, "r") if args.file != "-" else sys.stdin
    output_file = open(args.output, "w") if args.output != "-" else sys.stdout
    try:
        processed = process(
            input_file,
            word_threshold=args.word_threshold,
            prosiness_threshold=args.prosiness_threshold
        )
        for line in processed:
            output_file.write(line)
    finally:
        # Close only the files we opened ourselves; never the standard streams.
        if input_file is not sys.stdin:
            input_file.close()
        if output_file is not sys.stdout:
            output_file.close()


if __name__ == '__main__':
    main()
#!/usr/bin/env python3
"""
Unwrap text and attempt to split into lines by paragraph using a heuristic.
"""
import argparse
import sys
# Command-line interface: input/output locations and the paragraph-break
# heuristic characters.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "-f", "--file",
    type=str,
    default="-",
    help="Input file (- for stdin)",
)
parser.add_argument(
    "-o", "--output",
    type=str,
    default="-",
    help="Output file (- for stdout)",
)
parser.add_argument(
    "-e", "--paragraph_ending",
    type=str,
    default=".",
    help=(
        "Ending characters that may be considered paragraph breaks when found before newlines. "
        "This heuristic is used to unwrap hard-wrapped lines without losing paragraphs. "
        "List characters without spaces. "
        "(. by default)"
    ),
)
def _clean_line(line, paragraph_ending):
line = line.strip()
if line == "":
return ""
elif line.endswith(paragraph_ending):
return line + "\n"
else:
return line + " "
def reformat_lines(lines, paragraph_ending):
text = "".join(_clean_line(line, paragraph_ending) for line in lines)
text = text.strip()
return text.splitlines()
def main():
    """
    Parse CLI arguments, unwrap the input's hard-wrapped lines, and print
    one paragraph per line to the output.

    Files opened here (anything other than stdin/stdout) are closed on
    exit, even if processing raises.
    """
    args = parser.parse_args()
    input_file = open(args.file, "r") if args.file != "-" else sys.stdin
    output_file = open(args.output, "w") if args.output != "-" else sys.stdout
    # str.endswith accepts a tuple, so each character of --paragraph_ending
    # acts as an alternative paragraph terminator.
    paragraph_ending = tuple(args.paragraph_ending)
    try:
        for line in reformat_lines(input_file, paragraph_ending):
            print(line, file=output_file)
    finally:
        # Close only the files we opened ourselves; never the standard streams.
        if input_file is not sys.stdin:
            input_file.close()
        if output_file is not sys.stdout:
            output_file.close()


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment