rrcgat/format_text.py

## format_text.py
#!/usr/bin/env python
"""
Format text taken from a pipe, the command-line arguments, or the clipboard
to improve its readability, with a focus on Chinese text.
"""
import platform
import re
import shutil
import subprocess
import sys
from os import isatty
from typing import Callable, List

# Full-width symbols and digits -> their half-width ASCII equivalents.
BASIC_TRANSLATION_TABLE = str.maketrans(
    "％＃＠＆｜１２３４５６７８９０",
    "%#@&|1234567890",
)

# ASCII punctuation -> the corresponding Chinese punctuation.
ENGLISH_TO_CHINESE_TABLE = str.maketrans(
    ",.:;!?()",
    "，。：；！？（）",
)

# Chinese punctuation that takes no space on either side.
NON_SPACE_AROUND_PUNCTUATION = "，。：；！？、【】（）「」"

# ASCII and curly closing punctuation that takes no space in front of it.
NON_PREFIX_SPACE_PUNCTUATION_EN = ",.:;!?”’)"

# Every character that must not be preceded by a space.
NON_PREFIX_SPACE_PUNCTUATION = "".join(
    (NON_PREFIX_SPACE_PUNCTUATION_EN, NON_SPACE_AROUND_PUNCTUATION)
)
# Every character that must not be followed by a space.
NON_SUFFIX_SPACE_PUNCTUATION = NON_SPACE_AROUND_PUNCTUATION


def remove_spaces_around_chinese(text: str) -> str:
    """Remove whitespace that separates two Chinese characters.

    A run of ASCII spaces and/or ideographic spaces (U+3000), or one single
    newline, is dropped when it sits directly between two characters in the
    range U+4E00..U+9FFF. Whitespace next to anything else is kept.

    For example:
        input: "你 好 世 界 , Hello world ! "
        output: "你好世界 , Hello world ! "
    """
    # Look-around assertions test the neighbouring characters without
    # consuming them, so one character can close a gap and open the next.
    # A single pass is therefore enough; a pattern that consumes both
    # neighbours would skip every other gap and need a second pass.
    return re.sub(
        "(?<=[\u4e00-\u9fff])(?:[ \u3000]+|\n)(?=[\u4e00-\u9fff])", "", text
    )


def remove_redundant_sapces(text: str) -> str:
    """Tidy the spacing around punctuation and strip trailing whitespace."""
    substitutions = (
        # Drop spaces in front of punctuation that must not be preceded by one
        (f" +([{NON_PREFIX_SPACE_PUNCTUATION}])", r"\1"),
        # Drop spaces behind punctuation that must not be followed by one
        (f"([{NON_SUFFIX_SPACE_PUNCTUATION}]) +", r"\1"),
        # Collapse a run of spaces behind these marks into exactly one space
        (f"""([{NON_PREFIX_SPACE_PUNCTUATION_EN}#'"]) +""", r"\1 "),
        # Drop spaces between a Chinese character and a following `‘’“”`
        ("([\u4e00-\u9fff])( *)([‘’“”])", r"\1\3"),
        # Drop spaces between a `‘’“”` and a following Chinese character
        ("([‘’“”])( *)([\u4e00-\u9fff])", r"\1\3"),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text.rstrip()


def _standardlize_repl(matchobj: re.Match):
    """Return the whole matched text with English punctuation mapped to Chinese punctuation"""
    matched_text = matchobj.group()
    return matched_text.translate(ENGLISH_TO_CHINESE_TABLE)


def fix_mixed_punctuation(text: str) -> str:
    """Convert English punctuation that follows Chinese text to Chinese punctuation.

    The text is first mapped through ``BASIC_TRANSLATION_TABLE``. Then every
    one of ``,.:;!?()`` that follows a Chinese character or one of
    ``，。：；！？`` (optionally separated by whitespace) is replaced by its
    Chinese counterpart.
    """
    # Basic translation
    text = text.translate(BASIC_TRANSLATION_TABLE)

    pattern = "([\u4e00-\u9fff，。：；！？])(\\s*)([,.:;!?()])"
    # A freshly converted mark can be the left-hand context of the next one
    # (`你好: (世界)`, `你好!!!`), but a single pass only sees the text as it
    # was before that pass. Repeat until nothing changes so that runs of any
    # length are converted. Every match turns one ASCII mark into a Chinese
    # one, so the number of ASCII marks shrinks and the loop terminates.
    while True:
        converted = re.sub(pattern, _standardlize_repl, text)
        if converted == text:
            return text
        text = converted


def format_text(text: str) -> str:
    """Run the text through each formatter in turn and return the result"""
    # Applied in this order: punctuation, redundant spaces, spaces between
    # Chinese characters.
    punctuated = fix_mixed_punctuation(text)
    trimmed = remove_redundant_sapces(punctuated)
    return remove_spaces_around_chinese(trimmed)


def get_clipboard_tk() -> str:
    """Access system clipboard using `tkinter`

    A temporary Tk root is created to reach the clipboard and is destroyed
    again before returning, whether or not reading the clipboard succeeded.
    """
    import tkinter as tk

    root = tk.Tk()
    try:
        # Hide the root window; it is only needed to talk to the clipboard.
        root.withdraw()
        return root.clipboard_get()
    finally:
        # Release the Tcl interpreter and window instead of leaking them.
        root.destroy()


def get_clipboard_macos() -> str:
    """Access macOS system clipboard

    Runs the `pbpaste` command and returns what it printed;
    `osascript -e 'get the clipboard as text'` would be an alternative.
    """
    # The exit status is deliberately ignored; only the captured output is used.
    _status, pasted = subprocess.getstatusoutput("pbpaste")
    return pasted


def get_clipboard_linux() -> str:
    """Access Linux system clipboard

    Tries `wl-paste`, `xclip` and `xsel`, in that order, and returns the
    output of the first installed tool that exits successfully.
    If none of them is installed, tkinter is used as final choice.
    If tools are installed but every one of them fails, an empty string
    is returned.
    """
    commands = {
        "wl-paste": ["wl-paste"],
        "xclip": ["xclip", "-selection", "clipboard", "-o"],
        "xsel": ["xsel", "-b"],
    }
    tool_found = False
    for tool, argv in commands.items():
        if not shutil.which(tool):
            continue
        tool_found = True
        # Capture stderr separately and check the exit status, so that a
        # tool's error message is never mistaken for clipboard text and the
        # next tool still gets a chance.
        result = subprocess.run(argv, capture_output=True, text=True, check=False)
        if result.returncode == 0:
            output = result.stdout
            # Drop one trailing newline, as subprocess.getoutput() does.
            return output[:-1] if output.endswith("\n") else output

    return "" if tool_found else get_clipboard_tk()


def get_clipboard() -> str:
    """Return the clipboard text using the reader for the current platform"""
    readers = {
        "Linux": get_clipboard_linux,
        "Darwin": get_clipboard_macos,
    }
    # Any other platform goes through tkinter.
    reader = readers.get(platform.system(), get_clipboard_tk)
    return reader()


def main():
    """Format text from stdin, the arguments or the clipboard and print it.

    Stdin is used when it is not a terminal; otherwise the command-line
    arguments joined by spaces; otherwise the system clipboard.
    """
    arguments = sys.argv[1:]
    if not isatty(sys.stdin.fileno()):
        source = sys.stdin.read()
    elif arguments:
        source = " ".join(arguments)
    else:
        source = get_clipboard()
    print(format_text(source))


if __name__ == "__main__":
    main()
	#!/usr/bin/env python
	"""
	Format text taken from a pipe, the command-line arguments, or the clipboard
	to improve its readability, with a focus on Chinese text.
	"""
	import platform
	import re
	import shutil
	import subprocess
	import sys
	from os import isatty
	from typing import Callable, List

	# Full-width symbols and digits -> their half-width ASCII equivalents.
	# Both strings must have the same length; a stray backslash in front of
	# `|` would make str.maketrans raise ValueError.
	BASIC_TRANSLATION_TABLE = str.maketrans("％＃＠＆｜１２３４５６７８９０", "%#@&|1234567890")

	# ASCII punctuation -> the corresponding Chinese punctuation.
	ENGLISH_TO_CHINESE_TABLE = str.maketrans(",.:;!?()", "，。：；！？（）")

	# Chinese punctuation that takes no space on either side.
	NON_SPACE_AROUND_PUNCTUATION = "，。：；！？、【】（）「」"

	# ASCII and curly closing punctuation that takes no space in front of it.
	NON_PREFIX_SPACE_PUNCTUATION_EN = ",.:;!?”’)"

	# Every character that must not be preceded by a space.
	NON_PREFIX_SPACE_PUNCTUATION = (
	    NON_PREFIX_SPACE_PUNCTUATION_EN + NON_SPACE_AROUND_PUNCTUATION
	)
	# Every character that must not be followed by a space.
	NON_SUFFIX_SPACE_PUNCTUATION = NON_SPACE_AROUND_PUNCTUATION


	def remove_spaces_around_chinese(text: str) -> str:
	    """Remove whitespace that separates two Chinese characters.

	    A run of ASCII spaces and/or ideographic spaces (U+3000), or one single
	    newline, is dropped when it sits directly between two characters in the
	    range U+4E00..U+9FFF. Whitespace next to anything else is kept.

	    For example:
	        input: "你 好 世 界 , Hello world ! "
	        output: "你好世界 , Hello world ! "
	    """
	    # Look-around assertions test the neighbouring characters without
	    # consuming them, so one character can close a gap and open the next.
	    # A single pass is therefore enough. The `|` must stay unescaped: it
	    # is the alternation between "run of spaces" and "one newline".
	    return re.sub(
	        "(?<=[\u4e00-\u9fff])(?:[ \u3000]+|\n)(?=[\u4e00-\u9fff])", "", text
	    )


	def remove_redundant_sapces(text: str) -> str:
	    """Remove unnecessary spaces around punctuation and strip trailing whitespace"""
	    # Remove leading spaces for some punctuation
	    text = re.sub(f" +([{NON_PREFIX_SPACE_PUNCTUATION}])", r"\1", text)

	    # Remove trailing space for some punctuation
	    text = re.sub(f"([{NON_SUFFIX_SPACE_PUNCTUATION}]) +", r"\1", text)

	    # Collapse a run of spaces behind these marks into exactly one space
	    text = re.sub(f"""([{NON_PREFIX_SPACE_PUNCTUATION_EN}#'"]) +""", r"\1 ", text)

	    # Remove spaces between `‘’“”` punctuation and Chinese character
	    text = re.sub("([\u4e00-\u9fff])( *)([‘’“”])", r"\1\3", text)
	    text = re.sub("([‘’“”])( *)([\u4e00-\u9fff])", r"\1\3", text)

	    return text.rstrip()


	def _standardlize_repl(matchobj: re.Match):
	    """Return the whole matched text with English punctuation mapped to Chinese punctuation"""
	    return matchobj.group().translate(ENGLISH_TO_CHINESE_TABLE)


	def fix_mixed_punctuation(text: str) -> str:
	    """Convert English punctuation that follows Chinese text to Chinese punctuation.

	    The text is first mapped through ``BASIC_TRANSLATION_TABLE``. Then every
	    one of ``,.:;!?()`` that follows a Chinese character or one of
	    ``，。：；！？`` (optionally separated by whitespace) is replaced by its
	    Chinese counterpart.
	    """
	    # Basic translation
	    text = text.translate(BASIC_TRANSLATION_TABLE)

	    pattern = "([\u4e00-\u9fff，。：；！？])(\\s*)([,.:;!?()])"
	    # A freshly converted mark can be the left-hand context of the next one
	    # (`你好: (世界)`, `你好!!!`), but a single pass only sees the text as it
	    # was before that pass. Repeat until nothing changes so that runs of any
	    # length are converted. Every match turns one ASCII mark into a Chinese
	    # one, so the number of ASCII marks shrinks and the loop terminates.
	    while True:
	        converted = re.sub(pattern, _standardlize_repl, text)
	        if converted == text:
	            return text
	        text = converted


	def format_text(text: str) -> str:
	    """Format text using series of formatters, applied in the listed order"""
	    formatters: List[Callable[[str], str]] = [
	        fix_mixed_punctuation,
	        remove_redundant_sapces,
	        remove_spaces_around_chinese,
	    ]
	    for f in formatters:
	        text = f(text)
	    return text


	def get_clipboard_tk() -> str:
	    """Access system clipboard using `tkinter`

	    A temporary Tk root is created to reach the clipboard and is destroyed
	    again before returning, whether or not reading the clipboard succeeded.
	    """
	    import tkinter as tk

	    root = tk.Tk()
	    try:
	        # Hide the root window; it is only needed to talk to the clipboard.
	        root.withdraw()
	        return root.clipboard_get()
	    finally:
	        # Release the Tcl interpreter and window instead of leaking them.
	        root.destroy()


	def get_clipboard_macos() -> str:
	    """Access macOS system clipboard

	    Use `pbpaste` command to capture output,
	    alternatively `osascript -e 'get the clipboard as text'`.
	    """
	    return subprocess.getoutput("pbpaste")


	def get_clipboard_linux() -> str:
	    """Access Linux system clipboard

	    Tries `wl-paste`, `xclip` and `xsel`, in that order, and returns the
	    output of the first installed tool that exits successfully.
	    If none of them is installed, tkinter is used as final choice.
	    If tools are installed but every one of them fails, an empty string
	    is returned.
	    """
	    commands = {
	        "wl-paste": ["wl-paste"],
	        "xclip": ["xclip", "-selection", "clipboard", "-o"],
	        "xsel": ["xsel", "-b"],
	    }
	    tool_found = False
	    for tool, argv in commands.items():
	        if not shutil.which(tool):
	            continue
	        tool_found = True
	        # Capture stderr separately and check the exit status, so that a
	        # tool's error message is never mistaken for clipboard text and the
	        # next tool still gets a chance.
	        result = subprocess.run(argv, capture_output=True, text=True, check=False)
	        if result.returncode == 0:
	            output = result.stdout
	            # Drop one trailing newline, as subprocess.getoutput() does.
	            return output[:-1] if output.endswith("\n") else output

	    return "" if tool_found else get_clipboard_tk()


	def get_clipboard() -> str:
	    """Access system clipboard content with the reader for the current platform"""
	    system = platform.system()
	    if system == "Linux":
	        return get_clipboard_linux()
	    if system == "Darwin":
	        return get_clipboard_macos()
	    # Any other platform goes through tkinter.
	    return get_clipboard_tk()


	def main():
	    """Format text from stdin, the arguments or the clipboard and print it.

	    Stdin is used when it is not a terminal; otherwise the command-line
	    arguments joined by spaces; otherwise the system clipboard.
	    """
	    is_pipe = not isatty(sys.stdin.fileno())
	    if is_pipe:
	        text = sys.stdin.read()
	    elif len(sys.argv) > 1:
	        text = " ".join(sys.argv[1:])
	    else:
	        text = get_clipboard()
	    print(format_text(text))


	if __name__ == "__main__":
	    main()