Skip to content

Instantly share code, notes, and snippets.

@rrcgat
Created March 12, 2023 15:28
Show Gist options
  • Save rrcgat/c75159efb6ee5a0b50a36bb3291d1c3e to your computer and use it in GitHub Desktop.
Save rrcgat/c75159efb6ee5a0b50a36bb3291d1c3e to your computer and use it in GitHub Desktop.
Format input text (read from a pipeline, the clipboard, or command-line arguments) to improve its readability, with a focus on Chinese text.
#!/usr/bin/env python
"""
Format input text (read from a pipeline, the clipboard, or command-line
arguments) to improve its readability, with a focus on Chinese text.
"""
import platform
import re
import shutil
import subprocess
import sys
from os import isatty
from typing import Callable, List
# NOTE: every fullwidth form below is spelled as a \uXXXX escape on purpose.
# Written literally, these characters are folded into their ASCII look-alikes
# by Unicode compatibility normalization (NFKC), which silently turns the
# translation tables into identity mappings.

# Fullwidth symbols and digits -> their ASCII equivalents.
BASIC_TRANSLATION_TABLE = str.maketrans(
    "\uff05\uff03\uff20\uff06\uff5c"  # fullwidth % # @ & |
    "\uff11\uff12\uff13\uff14\uff15"  # fullwidth 1 2 3 4 5
    "\uff16\uff17\uff18\uff19\uff10",  # fullwidth 6 7 8 9 0
    "%#@&|1234567890",
)

# ASCII punctuation -> Chinese (fullwidth) punctuation, position by position:
# comma, ideographic full stop, colon, semicolon, exclamation mark,
# question mark, left parenthesis, right parenthesis.
ENGLISH_TO_CHINESE_TABLE = str.maketrans(
    ",.:;!?()",
    "\uff0c\u3002\uff1a\uff1b\uff01\uff1f\uff08\uff09",
)

# Chinese punctuation that takes no space on either side.
NON_SPACE_AROUND_PUNCTUATION = (
    "\uff0c\u3002\uff1a\uff1b\uff01\uff1f"  # fullwidth , ideographic stop : ; ! ?
    "\u3001\u3010\u3011"  # ideographic comma, lenticular brackets
    "\uff08\uff09"  # fullwidth parentheses
    "\u300c\u300d"  # corner brackets
)

# English punctuation that takes no space in front of it (a single space
# after it is allowed): ASCII , . : ; ! ? then the right double and right
# single quotation marks, then ASCII ).
NON_PREFIX_SPACE_PUNCTUATION_EN = ",.:;!?\u201d\u2019)"

# Every mark that must not be preceded by a space.
NON_PREFIX_SPACE_PUNCTUATION = (
    NON_PREFIX_SPACE_PUNCTUATION_EN + NON_SPACE_AROUND_PUNCTUATION
)

# Every mark that must not be followed by a space.
NON_SUFFIX_SPACE_PUNCTUATION = NON_SPACE_AROUND_PUNCTUATION
def remove_spaces_around_chinese(text: str) -> str:
    """Delete whitespace that separates two Chinese characters.

    A gap is either a run of ASCII / ideographic spaces or exactly one
    newline, and it is removed only when the characters on both sides are in
    the range U+4E00-U+9FFF. Everything else is left untouched.

    For example:
        input:  "你 好 世 界 , Hello world ! "
        output: "你好世界 , Hello world ! "
    """
    # The neighbouring characters are tested with zero-width look-arounds, so
    # they are not consumed by the match. One character can therefore close
    # one gap and open the next, and a single pass handles chains of any
    # length (a consuming pattern needs a second pass for every other gap).
    gap_between_chinese = (
        "(?<=[\u4e00-\u9fff])(?:[ \u3000]+|\n)(?=[\u4e00-\u9fff])"
    )
    return re.sub(gap_between_chinese, "", text)
def remove_redundant_sapces(text: str) -> str:
    """Tidy the spacing next to punctuation and strip trailing whitespace."""
    # Each rule is (pattern, replacement). Order matters: every rule runs on
    # the output of the one before it.
    rules = (
        # Drop spaces in front of marks listed in NON_PREFIX_SPACE_PUNCTUATION
        (f" +([{NON_PREFIX_SPACE_PUNCTUATION}])", r"\1"),
        # Drop spaces behind marks listed in NON_SUFFIX_SPACE_PUNCTUATION
        (f"([{NON_SUFFIX_SPACE_PUNCTUATION}]) +", r"\1"),
        # Collapse a run of spaces behind these marks into exactly one space
        (f"""([{NON_PREFIX_SPACE_PUNCTUATION_EN}#'"]) +""", r"\1 "),
        # Drop spaces between a Chinese character and a curly quote ...
        ("([\u4e00-\u9fff])( *)([‘’“”])", r"\1\3"),
        # ... and between a curly quote and a Chinese character
        ("([‘’“”])( *)([\u4e00-\u9fff])", r"\1\3"),
    )
    cleaned = text
    for pattern, replacement in rules:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.rstrip()
def _standardlize_repl(matchobj: re.Match):
    """Return the whole matched text translated with ENGLISH_TO_CHINESE_TABLE.

    Intended to be passed to `re.sub` as the replacement callback.
    """
    matched_text = matchobj.group(0)
    return matched_text.translate(ENGLISH_TO_CHINESE_TABLE)
def fix_mixed_punctuation(text: str) -> str:
    """Convert English punctuation that follows Chinese text to Chinese punctuation.

    The text is first passed through BASIC_TRANSLATION_TABLE. After that, an
    English punctuation mark is converted (via `_standardlize_repl`) when it
    directly follows, optionally after whitespace, a Chinese character or a
    Chinese comma, full stop, colon, semicolon, exclamation or question mark.
    Punctuation inside purely English text is left alone.
    """
    # Basic translation
    text = text.translate(BASIC_TRANSLATION_TABLE)

    # Fullwidth comma, ideographic full stop, fullwidth colon, semicolon,
    # exclamation mark and question mark. They are written as escapes so the
    # class cannot degrade into ASCII punctuation, which would make the
    # pattern fire inside plain English text such as "Wait...".
    chinese_punctuation = "\uff0c\u3002\uff1a\uff1b\uff01\uff1f"
    pattern = re.compile(
        f"([\u4e00-\u9fff{chinese_punctuation}])(\\s*)([,.:;!?()])"
    )

    # A converted mark can itself be the anchor for the next English mark
    # (e.g. a colon followed by an opening parenthesis), so repeat until the
    # text stops changing. Every productive pass rewrites at least one
    # character, so len(text) passes is a safe upper bound.
    for _ in range(len(text)):
        converted = pattern.sub(_standardlize_repl, text)
        if converted == text:
            break
        text = converted
    return text
def format_text(text: str) -> str:
    """Run the text through every formatter, in a fixed order, and return it."""
    # 1. normalise punctuation, 2. tidy spaces around punctuation,
    # 3. drop spaces between Chinese characters.
    punctuated = fix_mixed_punctuation(text)
    tightened = remove_redundant_sapces(punctuated)
    return remove_spaces_around_chinese(tightened)
def get_clipboard_tk() -> str:
    """Access system clipboard using `tkinter`.

    `tkinter` is imported lazily so the script still works on systems without
    Tk as long as this fallback is never reached.
    """
    import tkinter as tk

    root = tk.Tk()
    try:
        # Hide the root window; it only exists to talk to the clipboard.
        root.withdraw()
        return root.clipboard_get()
    finally:
        # Always release the Tk interpreter and its window, even when
        # clipboard_get() raises.
        root.destroy()
def get_clipboard_macos() -> str:
    """Read the macOS clipboard by running `pbpaste` through the shell.

    `osascript -e 'get the clipboard as text'` would be an alternative
    command. The exit status of `pbpaste` is ignored; only its captured
    output is returned.
    """
    _exit_status, pasted = subprocess.getstatusoutput("pbpaste")
    return pasted
def get_clipboard_linux() -> str:
    """Access Linux system clipboard.

    Three command-line tools are tried in order: `wl-paste`, `xclip` and
    `xsel`. The output of the first installed tool that exits successfully is
    returned, minus one trailing newline.

    If none of the tools is installed, `tkinter` is used as the final choice.
    If at least one tool is installed but every attempt fails, an empty
    string is returned, so a tool's error message is never mistaken for
    clipboard text.
    """
    candidates = (
        ["wl-paste"],
        ["xclip", "-selection", "clipboard", "-o"],
        ["xsel", "-b"],
    )
    found_tool = False
    for argv in candidates:
        if shutil.which(argv[0]) is None:
            continue
        found_tool = True
        try:
            # stderr is discarded so diagnostics never leak into the result.
            proc = subprocess.run(
                argv,
                stdout=subprocess.PIPE,
                stderr=subprocess.DEVNULL,
                text=True,
            )
        except OSError:
            # The tool vanished or could not be executed; try the next one.
            continue
        if proc.returncode == 0:
            output = proc.stdout
            # Drop a single trailing newline, like subprocess.getoutput does.
            return output[:-1] if output.endswith("\n") else output
    if found_tool:
        return ""
    return get_clipboard_tk()
def get_clipboard() -> str:
    """Return the system clipboard content using the reader for this platform."""
    # Platforms without a dedicated reader fall back to the tkinter one.
    readers = {
        "Linux": get_clipboard_linux,
        "Darwin": get_clipboard_macos,
    }
    reader = readers.get(platform.system(), get_clipboard_tk)
    return reader()
def main():
    """Format text from stdin, the command line or the clipboard and print it.

    Sources are tried in this order:

    1. piped / redirected standard input, when it yields any data;
    2. the command-line arguments, joined with single spaces;
    3. the system clipboard.
    """
    text = None
    stdin = sys.stdin
    if stdin is not None and not stdin.isatty():
        # An empty non-TTY stdin (e.g. /dev/null when run from a script) must
        # not shadow the arguments or the clipboard, so treat it as absent.
        text = stdin.read() or None
    if text is None and len(sys.argv) > 1:
        text = " ".join(sys.argv[1:])
    if text is None:
        text = get_clipboard()
    print(format_text(text))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment