Last active
February 20, 2017 13:02
-
-
Save luangong/6ccd4dbaa5c35a03118df85bdc5343f1 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import codecs
import re
import sys

import regex
def join_cjk(s): | |
cjk = r'([\p{CJK Unified Ideographs}\p{Halfwidth and Fullwidth Forms}])' | |
pattern = cjk + r'\n *' + cjk | |
return regex.sub(pattern, r'\1\2', s, flags=regex.MULTILINE) | |
def main(argv):
    """Filter standard input through ``join_cjk`` onto standard output.

    Both streams are handled as UTF-8 by wrapping their underlying binary
    buffers. All of standard input is read before anything is written.

    Args:
        argv: Command-line arguments; not used by this function.

    Returns:
        None.
    """
    utf8_in = codecs.getreader('utf-8')(sys.stdin.buffer)
    joined = join_cjk(utf8_in.read())
    codecs.getwriter('utf-8')(sys.stdout.buffer).write(joined)
if __name__ == '__main__':
    # Run as a script: hand main()'s return value to sys.exit().
    exit_status = main(sys.argv)
    sys.exit(exit_status)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import unittest | |
import join_cjk | |
class JoinCjkTest(unittest.TestCase):
    """Checks which line breaks ``join_cjk.join_cjk`` removes."""

    def test_cjk_and_cjk(self):
        # A break between two CJK characters is removed.
        joined = join_cjk.join_cjk('中文\n测试')
        self.assertEqual(joined, '中文测试')

    def test_cjk_and_latin(self):
        # CJK followed by Latin on the next line: the break is kept.
        text = '你好\nhello'
        self.assertEqual(join_cjk.join_cjk(text), '你好\nhello')

    def test_latin_and_cjk(self):
        # Latin followed by CJK on the next line: the break is kept.
        text = 'hello\n你好'
        self.assertEqual(join_cjk.join_cjk(text), 'hello\n你好')

    def test_latin_and_latin(self):
        # Text without any CJK characters comes back unchanged.
        text = 'hello\nworld'
        self.assertEqual(join_cjk.join_cjk(text), 'hello\nworld')

    def test_list_item(self):
        # Indentation after the break is removed along with the newline.
        joined = join_cjk.join_cjk('1. 列表\n 缩进')
        self.assertEqual(joined, '1. 列表缩进')

    def test_paragraph_in_list_item(self):
        # Leading indentation on the first line is preserved.
        joined = join_cjk.join_cjk(' 列表里的\n 独立段落')
        self.assertEqual(joined, ' 列表里的独立段落')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment