Created
March 12, 2023 15:28
-
-
Save rrcgat/c75159efb6ee5a0b50a36bb3291d1c3e to your computer and use it in GitHub Desktop.
Format input (include pipeline, clipboard and arguments) text to improve its readability, with a focus on Chinese text.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
Format input (include pipeline, clipboard and arguments) text | |
to improve its readability, with a focus on Chinese text. | |
""" | |
import platform | |
import re | |
import shutil | |
import subprocess | |
import sys | |
from os import isatty | |
from typing import Callable, List | |
BASIC_TRANSLATION_TABLE = str.maketrans("%#@&|1234567890", "%#@&|1234567890") | |
ENGLISH_TO_CHINESE_TABLE = str.maketrans(",.:;!?()", ",。:;!?()") | |
NON_SPACE_AROUND_PUNCTUATION = ",。:;!?、【】()「」" | |
NON_PREFIX_SPACE_PUNCTUATION_EN = ",.:;!?”’)" | |
NON_PREFIX_SPACE_PUNCTUATION = ( | |
NON_PREFIX_SPACE_PUNCTUATION_EN + NON_SPACE_AROUND_PUNCTUATION | |
) | |
NON_SUFFIX_SPACE_PUNCTUATION = NON_SPACE_AROUND_PUNCTUATION | |
def remove_spaces_around_chinese(text: str) -> str:
    """Remove whitespace that separates two Chinese characters.

    A gap is either a run of ASCII / ideographic (U+3000) spaces or a single
    newline. It is removed only when the characters on both sides are in
    the range U+4E00-U+9FFF; everything else is left untouched.

    For example:
        input:  "你 好 世 界 , Hello world ! "
        output: "你好世界 , Hello world ! "
    """
    # The lookbehind/lookahead check the neighbouring Chinese characters
    # without consuming them, so one character can close a gap and open the
    # next one.  A pattern that consumes both neighbours would skip every
    # other gap in "你 好 世 界" and need a second pass.
    return re.sub(
        "(?<=[\u4e00-\u9fff])(?:[ \u3000]+|\n)(?=[\u4e00-\u9fff])", "", text
    )
def remove_redundant_sapces(text: str) -> str:
    """Strip or collapse spaces that sit next to punctuation.

    The substitutions below are applied one after another, each on the
    result of the previous one, and trailing whitespace is stripped from
    the final text.
    """
    rules = (
        # Drop spaces in front of any character listed in
        # NON_PREFIX_SPACE_PUNCTUATION.
        (f" +([{NON_PREFIX_SPACE_PUNCTUATION}])", r"\1"),
        # Drop spaces behind any character listed in
        # NON_SUFFIX_SPACE_PUNCTUATION.
        (f"([{NON_SUFFIX_SPACE_PUNCTUATION}]) +", r"\1"),
        # Collapse a run of spaces that follows English punctuation, `#`
        # or a straight quote down to exactly one space.
        (f"""([{NON_PREFIX_SPACE_PUNCTUATION_EN}#'"]) +""", r"\1 "),
        # Drop spaces between a Chinese character and a following curly quote.
        ("([\u4e00-\u9fff])( *)([‘’“”])", r"\1\3"),
        # Drop spaces between a curly quote and a following Chinese character.
        ("([‘’“”])( *)([\u4e00-\u9fff])", r"\1\3"),
    )
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)
    return text.rstrip()
def _standardlize_repl(matchobj: re.Match) -> str:
    """Return the matched text with ENGLISH_TO_CHINESE_TABLE applied.

    Intended as the replacement callback for `re.sub`: the whole match is
    translated, so characters the table does not list pass through as-is.
    """
    matched_text = matchobj.group(0)
    return matched_text.translate(ENGLISH_TO_CHINESE_TABLE)
def fix_mixed_punctuation(text: str) -> str:
    """Convert English punctuation that follows Chinese text to Chinese punctuation.

    First applies BASIC_TRANSLATION_TABLE to the whole text. Then every
    English punctuation mark from `,.:;!?()` that comes directly after a
    Chinese character or a Chinese punctuation mark (optionally separated
    by whitespace) is translated with ENGLISH_TO_CHINESE_TABLE. Punctuation
    inside purely English text has no such left context and is not touched.
    """
    # Basic translation
    text = text.translate(BASIC_TRANSLATION_TABLE)
    # Left context: a character in U+4E00-U+9FFF or one of the Chinese marks
    # ",。:;!?".  These must be the fullwidth forms; with ASCII marks here
    # the pattern would also rewrite plain English runs such as "!.".
    pattern = re.compile("([\u4e00-\u9fff,。:;!?])(\\s*)([,.:;!?()])")
    # Two passes: a match consumes its left-context character, so in
    # `你好: (世界)` the "(" only becomes eligible once the first pass has
    # turned ":" into a Chinese colon.
    for _ in range(2):
        text = pattern.sub(_standardlize_repl, text)
    return text
def format_text(text: str) -> str:
    """Run *text* through each formatting pass in turn and return the result.

    The passes run in a fixed order: `fix_mixed_punctuation`, then
    `remove_redundant_sapces`, then `remove_spaces_around_chinese`. Each
    pass receives the output of the one before it.
    """
    pipeline: List[Callable[[str], str]] = [
        fix_mixed_punctuation,
        remove_redundant_sapces,
        remove_spaces_around_chinese,
    ]
    result = text
    for step in pipeline:
        result = step(result)
    return result
def get_clipboard_tk() -> str:
    """Access system clipboard using `tkinter`

    A hidden Tk root window is created just long enough to read the
    clipboard and is destroyed again afterwards, whether or not the read
    succeeds.
    """
    # Imported lazily so tkinter is only required when this fallback is used.
    import tkinter as tk

    root = tk.Tk()
    try:
        # Keep the root window off screen; only its clipboard access is needed.
        root.withdraw()
        return root.clipboard_get()
    finally:
        # Release the Tk interpreter and window instead of leaking them.
        root.destroy()
def get_clipboard_macos() -> str:
    """Return the macOS clipboard contents.

    Runs the `pbpaste` command and returns what it prints;
    `osascript -e 'get the clipboard as text'` would be an alternative.
    """
    paste_command = "pbpaste"
    return subprocess.getoutput(paste_command)
def get_clipboard_linux() -> str:
    """Return the Linux clipboard contents.

    Three tools are probed in this order: `wl-paste`, `xclip`, `xsel`.
    The first one found on PATH is run and its output returned. If none
    of them is installed, tkinter is used as the final choice.
    """
    candidates = (
        ("wl-paste", "wl-paste"),
        ("xclip", "xclip -selection clipboard -o"),
        ("xsel", "xsel -b"),
    )
    for executable, command_line in candidates:
        if shutil.which(executable) is None:
            continue
        return subprocess.getoutput(command_line)
    return get_clipboard_tk()
def get_clipboard() -> str:
    """Return the system clipboard content.

    Dispatches on `platform.system()`: "Linux" and "Darwin" use their
    dedicated helpers, every other platform goes through tkinter.
    """
    system_name = platform.system()
    if system_name == "Linux":
        return get_clipboard_linux()
    if system_name == "Darwin":
        return get_clipboard_macos()
    return get_clipboard_tk()
def main():
    """Format text from stdin, the command line or the clipboard and print it.

    Input source, in priority order:
      1. stdin, when it is not attached to a terminal (e.g. a pipeline);
      2. the command-line arguments, joined with single spaces;
      3. the system clipboard.
    """
    stdin_is_terminal = isatty(sys.stdin.fileno())
    cli_words = sys.argv[1:]
    if not stdin_is_terminal:
        source_text = sys.stdin.read()
    elif cli_words:
        source_text = " ".join(cli_words)
    else:
        source_text = get_clipboard()
    print(format_text(source_text))


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment