Skip to content

Instantly share code, notes, and snippets.

@luangong
Last active February 20, 2017 13:02
Show Gist options
  • Save luangong/6ccd4dbaa5c35a03118df85bdc5343f1 to your computer and use it in GitHub Desktop.
Save luangong/6ccd4dbaa5c35a03118df85bdc5343f1 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import codecs
import regex
import sys
def join_cjk(s):
    """Join lines that were hard-wrapped in the middle of CJK text.

    A newline, together with any spaces that directly follow it, is
    removed when the character immediately before the newline and the
    character immediately after those spaces are both matched by
    ``\\p{CJK Unified Ideographs}`` or
    ``\\p{Halfwidth and Fullwidth Forms}``.  Every other line break is
    left untouched.

    Args:
        s: The text to process.

    Returns:
        A copy of ``s`` with the qualifying line breaks removed.
    """
    cjk = r'[\p{CJK Unified Ideographs}\p{Halfwidth and Fullwidth Forms}]'
    # Zero-width lookarounds keep the neighbouring characters out of the
    # match, so one character can end a break and start the next one.
    # With consuming groups, three single-character lines "A\nB\nC"
    # would only lose the first newline, because "B" was already used up.
    line_break = r'(?<=' + cjk + r')\n *(?=' + cjk + r')'
    return regex.sub(line_break, '', s)
def main(argv):
    """Filter standard input through ``join_cjk`` onto standard output.

    Both streams are handled as UTF-8 by wrapping their underlying binary
    buffers.  ``argv`` is accepted but not used.
    """
    utf8_reader = codecs.getreader('utf-8')
    utf8_writer = codecs.getwriter('utf-8')
    # Read the whole input up front; the join works on the full text.
    source_text = utf8_reader(sys.stdin.buffer).read()
    utf8_writer(sys.stdout.buffer).write(join_cjk(source_text))
# Script entry point: run main() and hand its return value to sys.exit
# as the process exit status.
if __name__ == '__main__':
    exit_status = main(sys.argv)
    sys.exit(exit_status)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import unittest
import join_cjk
class JoinCjkTest(unittest.TestCase):
    """Unit tests for ``join_cjk.join_cjk``."""

    def _check(self, text, expected):
        """Assert that joining ``text`` produces ``expected``."""
        actual = join_cjk.join_cjk(text)
        self.assertEqual(actual, expected)

    def test_cjk_and_cjk(self):
        """A break between two CJK characters is removed."""
        self._check('中文\n测试', '中文测试')

    def test_cjk_and_latin(self):
        """A break from CJK into Latin text is kept."""
        self._check('你好\nhello', '你好\nhello')

    def test_latin_and_cjk(self):
        """A break from Latin into CJK text is kept."""
        self._check('hello\n你好', 'hello\n你好')

    def test_latin_and_latin(self):
        """A break between Latin words is kept."""
        self._check('hello\nworld', 'hello\nworld')

    def test_list_item(self):
        """Spaces after the newline are dropped together with it."""
        self._check('1. 列表\n 缩进', '1. 列表缩进')

    def test_paragraph_in_list_item(self):
        """Leading spaces on the first line are preserved."""
        self._check(' 列表里的\n 独立段落', ' 列表里的独立段落')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment