reagle/semantic_wrap.py

## semantic_wrap.py
#!/usr/bin/env python2.7
"""Wrap text, including semantically by add breaks at
terminal punctuation."""

# Using python 2.7 so portions can be used in Sublime Text 3 plugin

import argparse  # http://docs.python.org/dev/library/argparse.html
import logging
import re
import sys
import textwrap

# Module-level shorthands for the logging module's root-logger helpers.
debug = logging.debug
info = logging.info
# logging.warn is a deprecated alias that recent Python releases no longer
# provide; logging.warning is the supported spelling on Python 2.7 and 3.
warn = logging.warning
error = logging.error
critical = logging.critical
exception = logging.exception

# Matches the spot where a sentence ends so a line break can be placed there.
# Capture groups:
#   1 - tail of the sentence: two or more lowercase ASCII letters, or digits
#   2 - terminal punctuation, optionally paired with a closing parenthesis,
#       bracket, or double quote
#   3 - the single whitespace character that follows
# The trailing lookahead rejects the match when a digit comes next.
SEMANTIC_BREAK_RE = re.compile(
    r"""
    (                     # end of sentence includes...
        [a-z]{2,}|          # end of word
        [0-9]+              # end of a page or chapter number
    )
    (                     # terminal punctuation
        \.
        |\?
        |\!
        |\}
        |:

        |\.\)  # period paren
        |\)\.  # paren period
        |\.\]  # period bracket
        |\]\.  # bracket period
        |\."   # period quote
        |"\.   # quote period
        |"\)\. # quote paren period

        |\?\)  # question paren
        |\)\?  # paren question
        |\?\]  # question bracket
        |\]\?  # bracket question
        |\?"   # question quote
        |"\?   # quote question

        |\!\)  # exclaim paren
        |\)\!  # paren exclaim
        |\!\]  # exclaim bracket
        |\]\!  # bracket exclaim
        |\!"   # exclaim quote
        |"\!   # quote exclaim
    )
    (\s)                 # a whitespace
    (?!\d)               # negative lookahead for digit
    """,
    re.VERBOSE,
)
# Splits one line into an optional leading run of ">" and space characters
# (group 1) and the remainder of the line (group 2).
QUOTES_RE = re.compile(
    r"""
    (^[> ]+)?   # start quotes
    (.*)       # rest of line
    """,
    re.VERBOSE,
)


def quoted_wrap(content, width):
    """Re-wrap *content* to *width* columns, keeping ">" quote prefixes.

    Consecutive non-empty lines that carry the same number of ">" markers
    are joined and re-filled as one paragraph; every output line of that
    paragraph starts with that many ">" followed by one space (nothing for
    unquoted text).  A line with no text after its leading ">"/space run ends
    the current paragraph, and runs of whitespace-only lines are collapsed
    into a single empty line.  Leading spaces of a line are consumed together
    with the quote markers, so indentation of the input is not kept.

    Args:
        content: the text to wrap, as one newline-separated string.
        width: maximum output line length passed to textwrap.TextWrapper;
            the quote prefix counts toward it.

    Returns:
        The re-wrapped text as a single string.
    """
    wrapper = textwrap.TextWrapper(width=width)
    wrapped = []  # rebuilt output: filled paragraphs and separators
    chunk = [""]  # lines collected for the paragraph in progress
    level_prev = None  # quote level of `chunk`; None until the first line

    for line in content.split("\n"):
        logging.info("---")
        logging.info("line = '%s'", line)
        quotes, text = QUOTES_RE.match(line).groups()
        level = quotes.count(">") if quotes else 0
        if level_prev is None:  # first line: nothing to compare against yet
            logging.info("* no quote_level_prev")
            level_prev = level
        logging.info("text = '%s'", text)
        logging.info(
            "quote_level = %s quote_level_prev = %s", level, level_prev
        )

        # Same quote level and real text: keep collecting the paragraph.
        if level == level_prev and text:
            chunk.append(text)
            logging.info("buffer = %s", chunk)
            continue

        # Quote level changed or the paragraph ended: emit what was collected
        # using the prefix of the level it was collected at.
        logging.info("* change in quote level or new paragraph")
        logging.info("buffer = %s", chunk)
        prefix = ">" * level_prev + " " if level_prev else ""
        wrapper.initial_indent = prefix
        wrapper.subsequent_indent = prefix
        joined = " ".join(chunk).strip()
        logging.info("buffer_joined = '%s'", joined)
        wrapped.append(wrapper.fill(joined))
        if not text:  # new paragraph
            logging.info("* new paragraph")
            wrapped.append(prefix)
        level_prev = level
        chunk = [text]
        logging.info("NEW_CONTENT = %s", wrapped)

    # The loop only emits a paragraph when a later line closes it, so text
    # that runs to the end of the input is still pending here.  Emit it,
    # skipping an empty remainder so input that already ends with a newline
    # produces the same output as before.
    tail = " ".join(chunk).strip()
    if tail:
        prefix = ">" * level_prev + " " if level_prev else ""
        wrapper.initial_indent = prefix
        wrapper.subsequent_indent = prefix
        logging.info("buffer_joined = '%s'", tail)
        wrapped.append(wrapper.fill(tail))

    result = "\n".join(wrapped)  # TODO: remove first empty
    return re.sub(r"\n\s*\n", "\n\n", result)  # collapse empty


def _break_sentences(line, prefix):
    """Return *line* with a newline plus *prefix* after each sentence end."""

    def replace(match):
        # When the matched whitespace is the last character of the line there
        # is nothing after it to move onto a new line; leave it untouched.
        if match.end() == len(line):
            return match.group(0)
        # Drop the matched whitespace (group 3) so the broken line does not
        # end in trailing whitespace.
        return match.group(1) + match.group(2) + "\n" + prefix

    return SEMANTIC_BREAK_RE.sub(replace, line)


def semantic_wrap(text):
    """Break *text* after each sentence-ending punctuation mark.

    The text is processed paragraph by paragraph (paragraphs are separated
    by an empty line) and, within a paragraph, line by line.  Wherever
    SEMANTIC_BREAK_RE matches, the whitespace after the terminal punctuation
    is replaced by a newline.  A line that starts with ">" is treated as
    quoted: its leading run of ">" and spaces is kept at the front and
    repeated at the start of every line produced from it.  Because each line
    is handled on its own, a paragraph may mix unquoted lines and quoted
    lines of different levels.

    Existing line breaks are kept as they are, so a sentence that already
    ends at the end of a line does not produce an empty line.

    Args:
        text: the text to break.

    Returns:
        The processed text; every paragraph is followed by a newline and
        paragraphs are joined with one further newline.
    """
    new_content = []
    logging.info("START")
    for paragraph in text.split("\n\n"):
        logging.info("line = '%s'", paragraph)
        wrapped_lines = []
        for line in paragraph.split("\n"):
            quotes = ""
            if line.startswith(">"):
                logging.info("is quoted")
                # Group 1 cannot be None here: the line starts with ">".
                quotes, line = QUOTES_RE.match(line).groups()
                logging.info("quotes = '%s'", quotes)
            wrapped_lines.append(quotes + _break_sentences(line, quotes))
        wrapped_paragraph = "\n".join(wrapped_lines)
        logging.info("wrapped_line = '%s'", wrapped_paragraph)
        new_content.append(wrapped_paragraph + "\n")
    return "\n".join(new_content)


def main(argv):
    """Parse the command line and configure logging.

    Args:
        argv: list of argument strings, without the program name.

    Returns:
        The argparse.Namespace holding files (a one-element list), semantic,
        wrap, out_filename, log_to_file, and verbose.
    """
    arg_parser = argparse.ArgumentParser(
        description="a couple different types of line wrappers"
    )

    # positional arguments
    arg_parser.add_argument("files", nargs=1, metavar="FILE")
    # optional arguments
    arg_parser.add_argument(
        "-s",
        "--semantic",
        action="store_true",
        default=False,
        help="semantic wrap (at sentence endings)",
    )
    arg_parser.add_argument(
        "-w",
        "--wrap",
        type=int,
        default=72,
        help="wrap width (default: %(default)s)",
    )
    arg_parser.add_argument(
        "-o",
        "--out-filename",
        help="output results to filename",
        metavar="FILE",
    )
    arg_parser.add_argument(
        "-L",
        "--log-to-file",
        action="store_true",
        default=False,
        # Names the file that basicConfig below actually writes to.
        help="log to file wrap.log",
    )
    arg_parser.add_argument(
        "-V",
        "--verbose",
        action="count",
        default=0,
        help="Increase verbosity (specify multiple times for more)",
    )
    arg_parser.add_argument("--version", action="version", version="TBD")
    args = arg_parser.parse_args(argv)

    # Without -V the threshold sits above logging.CRITICAL (50), so nothing
    # is emitted; each -V lowers it.
    log_level = 100  # default
    if args.verbose >= 3:
        log_level = logging.DEBUG  # 10
    elif args.verbose == 2:
        log_level = logging.INFO  # 20
    elif args.verbose == 1:
        log_level = logging.ERROR  # 40
    LOG_FORMAT = "%(levelno)s %(funcName).5s: %(message)s"
    if args.log_to_file:
        logging.basicConfig(
            filename="wrap.log",
            filemode="w",
            level=log_level,
            format=LOG_FORMAT,
        )
    else:
        logging.basicConfig(level=log_level, format=LOG_FORMAT)

    return args


if "__main__" == __name__:
    args = main(sys.argv[1:])
    # Read the whole input up front; the with-block closes the file promptly.
    with open(args.files[0]) as in_file:
        content = in_file.read()
    if args.semantic:
        result = semantic_wrap(content)
    else:
        result = quoted_wrap(content, args.wrap)
    # Honor -o/--out-filename when given; otherwise write to stdout.
    if args.out_filename:
        with open(args.out_filename, "w") as out_file:
            # Mirror print(), which terminates the output with a newline.
            out_file.write(result + "\n")
    else:
        print(result)
	#!/usr/bin/env python2.7
	"""Wrap text, including semantically by add breaks at
	terminal punctuation."""

	# Using python 2.7 so portions can be used in Sublime Text 3 plugin

	import argparse # http://docs.python.org/dev/library/argparse.html
	import logging
	import re
	import sys
	import textwrap

	# Module-level shorthands for the logging module's root-logger helpers.
	debug = logging.debug
	info = logging.info
	# logging.warn is a deprecated alias that recent Python releases no longer
	# provide; logging.warning is the supported spelling on Python 2.7 and 3.
	warn = logging.warning
	error = logging.error
	critical = logging.critical
	exception = logging.exception

	# Matches the spot where a sentence ends so a line break can be placed there.
	# Capture groups:
	#   1 - tail of the sentence: two or more lowercase ASCII letters, or digits
	#   2 - terminal punctuation, optionally paired with a closing parenthesis,
	#       bracket, or double quote
	#   3 - the single whitespace character that follows
	# The trailing lookahead rejects the match when a digit comes next.
	# The alternatives are separated by a plain "|"; an escaped pipe would match
	# a literal "|" character instead of acting as alternation.
	SEMANTIC_BREAK_RE = re.compile(
	    r"""
	    (                     # end of sentence includes...
	        [a-z]{2,}|          # end of word
	        [0-9]+              # end of a page or chapter number
	    )
	    (                     # terminal punctuation
	        \.
	        |\?
	        |\!
	        |\}
	        |:

	        |\.\)  # period paren
	        |\)\.  # paren period
	        |\.\]  # period bracket
	        |\]\.  # bracket period
	        |\."   # period quote
	        |"\.   # quote period
	        |"\)\. # quote paren period

	        |\?\)  # question paren
	        |\)\?  # paren question
	        |\?\]  # question bracket
	        |\]\?  # bracket question
	        |\?"   # question quote
	        |"\?   # quote question

	        |\!\)  # exclaim paren
	        |\)\!  # paren exclaim
	        |\!\]  # exclaim bracket
	        |\]\!  # bracket exclaim
	        |\!"   # exclaim quote
	        |"\!   # quote exclaim
	    )
	    (\s)                 # a whitespace
	    (?!\d)               # negative lookahead for digit
	    """,
	    re.VERBOSE,
	)
	# Splits one line into an optional leading run of ">" and space characters
	# (group 1) and the remainder of the line (group 2).
	QUOTES_RE = re.compile(
	    r"""
	    (^[> ]+)?   # start quotes
	    (.*)       # rest of line
	    """,
	    re.VERBOSE,
	)


	def quoted_wrap(content, width):
	    """Re-wrap *content* to *width* columns, keeping ">" quote prefixes.

	    Consecutive non-empty lines that carry the same number of ">" markers
	    are joined and re-filled as one paragraph; every output line of that
	    paragraph starts with that many ">" followed by one space (nothing for
	    unquoted text).  A line with no text after its leading ">"/space run ends
	    the current paragraph, and runs of whitespace-only lines are collapsed
	    into a single empty line.  Leading spaces of a line are consumed together
	    with the quote markers, so indentation of the input is not kept.

	    Args:
	        content: the text to wrap, as one newline-separated string.
	        width: maximum output line length passed to textwrap.TextWrapper;
	            the quote prefix counts toward it.

	    Returns:
	        The re-wrapped text as a single string.
	    """
	    wrapper = textwrap.TextWrapper(width=width)
	    wrapped = []  # rebuilt output: filled paragraphs and separators
	    chunk = [""]  # lines collected for the paragraph in progress
	    level_prev = None  # quote level of `chunk`; None until the first line

	    for line in content.split("\n"):
	        logging.info("---")
	        logging.info("line = '%s'", line)
	        quotes, text = QUOTES_RE.match(line).groups()
	        level = quotes.count(">") if quotes else 0
	        if level_prev is None:  # first line: nothing to compare against yet
	            logging.info("* no quote_level_prev")
	            level_prev = level
	        logging.info("text = '%s'", text)
	        logging.info(
	            "quote_level = %s quote_level_prev = %s", level, level_prev
	        )

	        # Same quote level and real text: keep collecting the paragraph.
	        if level == level_prev and text:
	            chunk.append(text)
	            logging.info("buffer = %s", chunk)
	            continue

	        # Quote level changed or the paragraph ended: emit what was collected
	        # using the prefix of the level it was collected at.
	        logging.info("* change in quote level or new paragraph")
	        logging.info("buffer = %s", chunk)
	        prefix = ">" * level_prev + " " if level_prev else ""
	        wrapper.initial_indent = prefix
	        wrapper.subsequent_indent = prefix
	        joined = " ".join(chunk).strip()
	        logging.info("buffer_joined = '%s'", joined)
	        wrapped.append(wrapper.fill(joined))
	        if not text:  # new paragraph
	            logging.info("* new paragraph")
	            wrapped.append(prefix)
	        level_prev = level
	        chunk = [text]
	        logging.info("NEW_CONTENT = %s", wrapped)

	    # The loop only emits a paragraph when a later line closes it, so text
	    # that runs to the end of the input is still pending here.  Emit it,
	    # skipping an empty remainder so input that already ends with a newline
	    # gains no extra trailing newline.
	    tail = " ".join(chunk).strip()
	    if tail:
	        prefix = ">" * level_prev + " " if level_prev else ""
	        wrapper.initial_indent = prefix
	        wrapper.subsequent_indent = prefix
	        logging.info("buffer_joined = '%s'", tail)
	        wrapped.append(wrapper.fill(tail))

	    result = "\n".join(wrapped)  # TODO: remove first empty
	    return re.sub(r"\n\s*\n", "\n\n", result)  # collapse empty


	def _break_sentences(line, prefix):
	    """Return *line* with a newline plus *prefix* after each sentence end."""

	    def replace(match):
	        # When the matched whitespace is the last character of the line there
	        # is nothing after it to move onto a new line; leave it untouched.
	        if match.end() == len(line):
	            return match.group(0)
	        # Drop the matched whitespace (group 3) so the broken line does not
	        # end in trailing whitespace.
	        return match.group(1) + match.group(2) + "\n" + prefix

	    return SEMANTIC_BREAK_RE.sub(replace, line)


	def semantic_wrap(text):
	    """Break *text* after each sentence-ending punctuation mark.

	    The text is processed paragraph by paragraph (paragraphs are separated
	    by an empty line) and, within a paragraph, line by line.  Wherever
	    SEMANTIC_BREAK_RE matches, the whitespace after the terminal punctuation
	    is replaced by a newline.  A line that starts with ">" is treated as
	    quoted: its leading run of ">" and spaces is kept at the front and
	    repeated at the start of every line produced from it.  Because each line
	    is handled on its own, a paragraph may mix unquoted lines and quoted
	    lines of different levels.

	    Existing line breaks are kept as they are, so a sentence that already
	    ends at the end of a line does not produce an empty line.

	    Args:
	        text: the text to break.

	    Returns:
	        The processed text; every paragraph is followed by a newline and
	        paragraphs are joined with one further newline.
	    """
	    new_content = []
	    logging.info("START")
	    for paragraph in text.split("\n\n"):
	        logging.info("line = '%s'", paragraph)
	        wrapped_lines = []
	        for line in paragraph.split("\n"):
	            quotes = ""
	            if line.startswith(">"):
	                logging.info("is quoted")
	                # Group 1 cannot be None here: the line starts with ">".
	                quotes, line = QUOTES_RE.match(line).groups()
	                logging.info("quotes = '%s'", quotes)
	            wrapped_lines.append(quotes + _break_sentences(line, quotes))
	        wrapped_paragraph = "\n".join(wrapped_lines)
	        logging.info("wrapped_line = '%s'", wrapped_paragraph)
	        new_content.append(wrapped_paragraph + "\n")
	    return "\n".join(new_content)


	def main(argv):
	    """Parse the command line and configure logging.

	    Args:
	        argv: list of argument strings, without the program name.

	    Returns:
	        The argparse.Namespace holding files (a one-element list), semantic,
	        wrap, out_filename, log_to_file, and verbose.
	    """
	    arg_parser = argparse.ArgumentParser(
	        description="a couple different types of line wrappers"
	    )

	    # positional arguments
	    arg_parser.add_argument("files", nargs=1, metavar="FILE")
	    # optional arguments
	    arg_parser.add_argument(
	        "-s",
	        "--semantic",
	        action="store_true",
	        default=False,
	        help="semantic wrap (at sentence endings)",
	    )
	    arg_parser.add_argument(
	        "-w",
	        "--wrap",
	        type=int,
	        default=72,
	        help="wrap width (default: %(default)s)",
	    )
	    arg_parser.add_argument(
	        "-o",
	        "--out-filename",
	        help="output results to filename",
	        metavar="FILE",
	    )
	    arg_parser.add_argument(
	        "-L",
	        "--log-to-file",
	        action="store_true",
	        default=False,
	        # Names the file that basicConfig below actually writes to.
	        help="log to file wrap.log",
	    )
	    arg_parser.add_argument(
	        "-V",
	        "--verbose",
	        action="count",
	        default=0,
	        help="Increase verbosity (specify multiple times for more)",
	    )
	    arg_parser.add_argument("--version", action="version", version="TBD")
	    args = arg_parser.parse_args(argv)

	    # Without -V the threshold sits above logging.CRITICAL (50), so nothing
	    # is emitted; each -V lowers it.
	    log_level = 100  # default
	    if args.verbose >= 3:
	        log_level = logging.DEBUG  # 10
	    elif args.verbose == 2:
	        log_level = logging.INFO  # 20
	    elif args.verbose == 1:
	        log_level = logging.ERROR  # 40
	    LOG_FORMAT = "%(levelno)s %(funcName).5s: %(message)s"
	    if args.log_to_file:
	        logging.basicConfig(
	            filename="wrap.log",
	            filemode="w",
	            level=log_level,
	            format=LOG_FORMAT,
	        )
	    else:
	        logging.basicConfig(level=log_level, format=LOG_FORMAT)

	    return args


	if "__main__" == __name__:
	    args = main(sys.argv[1:])
	    # Read the whole input up front; the with-block closes the file promptly.
	    with open(args.files[0]) as in_file:
	        content = in_file.read()
	    if args.semantic:
	        result = semantic_wrap(content)
	    else:
	        result = quoted_wrap(content, args.wrap)
	    # Honor -o/--out-filename when given; otherwise write to stdout.
	    if args.out_filename:
	        with open(args.out_filename, "w") as out_file:
	            # Mirror print(), which terminates the output with a newline.
	            out_file.write(result + "\n")
	    else:
	        print(result)