Skip to content

Instantly share code, notes, and snippets.

@gordonbrander
Last active November 6, 2022 07:36
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gordonbrander/44621df8403442625e838babe3032db6 to your computer and use it in GitHub Desktop.
Save gordonbrander/44621df8403442625e838babe3032db6 to your computer and use it in GitHub Desktop.
textiness.py — script to filter text based on how "texty" it is

Textiness

Example using pdfminer to extract text and then filter out lines that aren't "texty" enough.

pdf2txt.py something.pdf | ./textiness.py > temp.txt

For example, I'm personally using this to extract the text of the IPCC report on 1.5°C:

pdf2txt.py SR15_Chapter4_Low_Res.pdf | ./unwrap.py -e ".}" | ./textiness.py > SR15_Chapter4_Low_Res.txt

unwrap.py takes care of reformatting lines of text into coherent paragraphs. textiness.py then filters out lines that don't appear to be texty.

#!/usr/bin/env python3
"""
Filter lines of text by "textiness".
"""
import argparse
import sys
import re
import string
# Command-line interface: input/output locations plus the two tunable
# thresholds used by the textiness heuristic.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "-f", "--file",
    type=str,
    default="-",
    help="Input file (- for stdin)",
)
parser.add_argument(
    "-o", "--output",
    type=str,
    default="-",
    help="Output file (- for stdout)",
)
parser.add_argument(
    "-w", "--word_threshold",
    type=int,
    default=10,
    help="Word threshold",
)
parser.add_argument(
    "-p", "--prosiness_threshold",
    type=float,
    default=0.8,
    help="Prosiness threshold",
)
def count_words(line):
    """Return the number of whitespace-separated words in line."""
    return len(line.split())


# Characters typical of prose: letters plus common sentence punctuation.
PROSEY = r"[A-Za-z\,\.]"
# Characters suggesting non-prose content: digits, markup, code, math symbols.
NONPROSEY = r"[0-9_\@\&\^\%\$\#\<\>\[\]\{\}\*\/\\]"


def report_prosiness(line):
    """Return a (prosey, nonprosey) tuple of character counts for line."""
    return (
        len(re.findall(PROSEY, line)),
        len(re.findall(NONPROSEY, line))
    )


def is_texty(line, word_threshold=10, prosiness_threshold=0.8):
    """
    Heuristically decide whether line looks like prose.

    A line is "texty" when it contains more than word_threshold words and
    its share of non-prose characters is low enough that
    (1 - nonprosey / prosey) exceeds prosiness_threshold.

    Lines with no prose characters at all (e.g. a run of digits or
    symbols) are never texty; checking for that up front also prevents a
    ZeroDivisionError when prosey == 0.
    """
    prosey, nonprosey = report_prosiness(line)
    if prosey == 0:
        # No letters/commas/periods at all — cannot be prose, and the
        # ratio below would divide by zero.
        return False
    return (
        count_words(line) > word_threshold and
        (1.0 - (nonprosey / prosey)) > prosiness_threshold
    )
def process(lines, word_threshold=10, prosiness_threshold=0.8):
    """Yield only those lines from an iterable that pass the is_texty filter."""
    for candidate in lines:
        if not is_texty(candidate, word_threshold, prosiness_threshold):
            continue
        yield candidate
def main():
    """
    Parse CLI arguments, filter the input lines by textiness, and write
    the survivors to the output.

    Files opened here (anything other than stdin/stdout) are closed on
    exit, even if processing raises.
    """
    args = parser.parse_args()
    input_file = open(args.file, "r") if args.file != "-" else sys.stdin
    output_file = open(args.output, "w") if args.output != "-" else sys.stdout
    try:
        processed = process(
            input_file,
            word_threshold=args.word_threshold,
            prosiness_threshold=args.prosiness_threshold
        )
        for line in processed:
            output_file.write(line)
    finally:
        # Close only the files we opened ourselves; never the standard streams.
        if input_file is not sys.stdin:
            input_file.close()
        if output_file is not sys.stdout:
            output_file.close()


if __name__ == '__main__':
    main()
#!/usr/bin/env python3
"""
Unwrap text and attempt to split into lines by paragraph using a heuristic.
"""
import argparse
import sys
# Command-line interface: input/output locations and the paragraph-break
# heuristic characters.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "-f", "--file",
    type=str,
    default="-",
    help="Input file (- for stdin)",
)
parser.add_argument(
    "-o", "--output",
    type=str,
    default="-",
    help="Output file (- for stdout)",
)
parser.add_argument(
    "-e", "--paragraph_ending",
    type=str,
    default=".",
    help=(
        "Ending characters that may be considered paragraph breaks when found before newlines. "
        "This heuristic is used to unwrap hard-wrapped lines without losing paragraphs. "
        "List characters without spaces. "
        "(. by default)"
    ),
)
def _clean_line(line, paragraph_ending):
line = line.strip()
if line == "":
return ""
elif line.endswith(paragraph_ending):
return line + "\n"
else:
return line + " "
def reformat_lines(lines, paragraph_ending):
text = "".join(_clean_line(line, paragraph_ending) for line in lines)
text = text.strip()
return text.splitlines()
def main():
    """
    Parse CLI arguments, unwrap the input's hard-wrapped lines, and print
    one paragraph per line to the output.

    Files opened here (anything other than stdin/stdout) are closed on
    exit, even if processing raises.
    """
    args = parser.parse_args()
    input_file = open(args.file, "r") if args.file != "-" else sys.stdin
    output_file = open(args.output, "w") if args.output != "-" else sys.stdout
    # str.endswith accepts a tuple, so each character of --paragraph_ending
    # acts as an alternative paragraph terminator.
    paragraph_ending = tuple(args.paragraph_ending)
    try:
        for line in reformat_lines(input_file, paragraph_ending):
            print(line, file=output_file)
    finally:
        # Close only the files we opened ourselves; never the standard streams.
        if input_file is not sys.stdin:
            input_file.close()
        if output_file is not sys.stdout:
            output_file.close()


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment