#!/usr/bin/python
# -*- coding: cp949 -*-
# chwnam/win_demacboogi.py
# win_demacboogi.py
"""
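Python script that restores the original string from a string whose Hangul
syllables have been split into individual jamo.
Between Linux and OS X the file names stay UTF-8 encoded, so converting
between NFD and NFC is all that is needed. On Windows, however, the names are
fully re-encoded to CP949 and the NFD/NFC information carried under UTF-8 is
lost. The lost information is therefore reconstructed from the rules that
govern the order in which jamo appear.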
"""
from unicodedata import normalize
import sys
import os

# Hangul compatibility jamo tables. A jamo's position in its list is the index
# used when composing a syllable code point (19 initials, 21 medials, 28 finals).
chosung = list(u'ㄱㄲㄴㄷㄸㄹㅁㅂㅃㅅㅆㅇㅈㅉㅊㅋㅌㅍㅎ')
jungsong = list(u'ㅏㅐㅑㅒㅓㅔㅕㅖㅗㅘㅙㅚㅛㅜㅝㅞㅟㅠㅡㅢㅣ')
# Slot 0 is a blank placeholder so that the real final consonants sit at 1..27.
jongsong = list(u' ㄱㄲㄳㄴㄵㄶㄷㄹㄺㄻㄼㄽㄾㄿㅀㅁㅂㅄㅅㅆㅇㅈㅊㅋㅌㅍㅎ')


def normalize_str(form, str, encoding):
	"""
	Return ``str`` as text in the Unicode normalization form ``form``.

	form     -- normalization form name passed to unicodedata.normalize
	            (e.g. 'NFC' or 'NFD').
	str      -- byte string encoded with ``encoding``, or already-decoded text.
	encoding -- codec used to decode ``str`` when it is a byte string.
	"""
	# Only byte strings need decoding; calling .decode() on text would make
	# Python 2 encode it as ASCII first and fails outright on Python 3.
	text = str.decode(encoding) if isinstance(str, bytes) else str
	return normalize(form, text)


def jamo_input(input_string):
	"""
	Group the jamo of ``input_string`` into syllable index tuples.

	Every character is looked up in the module-level ``chosung`` (initial),
	``jungsong`` (medial) and ``jongsong`` (final) tables. A consonant that
	comes after a medial is taken as the initial of the next syllable only
	when the character following it is a medial; otherwise it is a final.

	Returns a list, in input order, whose items are either a 3-tuple
	``(chosung_index, jungsong_index, jongsong_index)`` with ``-1`` for a
	component that is absent, or the original character when it is not a jamo.
	"""
	strlen = len(input_string)
	result = []
	buf = []  # indices of the syllable currently being assembled
	jungsong_detected = False

	for idx, jamo_org in enumerate(input_string):
		# strip() turns whitespace into u'' so that a blank is never matched
		# against the blank placeholder stored at jongsong[0].
		jamo = jamo_org.strip()

		if jungsong_detected:
			# After a medial, a consonant opens a new syllable only if a
			# medial follows it; otherwise it closes the current syllable.
			next_is_jung = idx + 1 < strlen and input_string[idx + 1] in jungsong
			is_chos = next_is_jung and jamo in chosung
		else:
			is_chos = jamo in chosung

		if is_chos:
			if buf:
				result.append(buf)
			buf = [chosung.index(jamo)]
			jungsong_detected = False
		elif jamo in jungsong:
			if len(buf) == 1:
				buf.append(jungsong.index(jamo))
			else:
				# No pending initial: start a syllable without one.
				if buf:
					result.append(buf)
				buf = [-1, jungsong.index(jamo)]
			jungsong_detected = True
		elif jamo in jongsong:
			if len(buf) != 2:
				# A final with no initial+medial before it stands alone.
				if buf:
					result.append(buf)
				buf = [-1, -1]
			buf.append(jongsong.index(jamo))
			result.append(buf)
			buf = []
			jungsong_detected = False
		else:
			# Not a jamo: close any open syllable and pass the character through.
			if buf:
				result.append(buf)
				buf = []
			result.append(jamo_org)

	# Flush the syllable still open when the input ends; without this a
	# trailing syllable that has no final consonant would be lost.
	if buf:
		result.append(buf)

	# Pad every index list to length 3 with -1 and freeze it as a tuple.
	return [
		tuple(item + [-1] * (3 - len(item))) if isinstance(item, list) else item
		for item in result
	]


def build_string_from_indices(jamo_indices):
	"""
	Build the recomposed string from the items produced by ``jamo_input``.

	jamo_indices -- iterable whose items are either 3-tuples
	                ``(chosung_index, jungsong_index, jongsong_index)``
	                (``-1`` meaning "absent") or plain characters.

	A tuple holding both an initial and a medial becomes one precomposed
	Hangul syllable. A tuple missing either of them cannot form a syllable,
	so the jamo it does hold are emitted unchanged instead of being forced
	into a syllable with index 0 substituted for the missing part. Plain
	characters are copied through. Returns the resulting text.
	"""
	# unichr exists only on Python 2, where str is bytes; chr is its
	# Python 3 equivalent. The unused name is never evaluated.
	to_char = unichr if str is bytes else chr
	pieces = []

	for item in jamo_indices:
		if not isinstance(item, tuple):
			pieces.append(item)
			continue

		cho, jung, jong = item
		if cho > -1 and jung > -1:
			# 0xAC00 is the first syllable; there are 21 medials per initial
			# and 28 final slots per medial.
			pieces.append(to_char(0xac00 + (cho * 21 + jung) * 28 + max(jong, 0)))
			continue

		# Incomplete syllable: give back the individual jamo as they were.
		if cho > -1:
			pieces.append(chosung[cho])
		if jung > -1:
			pieces.append(jungsong[jung])
		if jong > 0:
			pieces.append(jongsong[jong])

	return u''.join(pieces)


if __name__ == '__main__':
	# Rename every path given on the command line to its recomposed form.
	for n in sys.argv[1:]:
		# Python 2 delivers arguments as CP949 byte strings, Python 3 as text.
		name = n.decode('cp949') if isinstance(n, bytes) else n

		# Recompose only the last path component: the directory part must
		# keep naming the directory that actually exists.
		head, tail = os.path.split(name)
		new_tail = build_string_from_indices(jamo_input(tail))
		if new_tail == tail:
			continue
		result = os.path.join(head, new_tail)

		message = u"Rename \"%s\" to \"%s\"" % (name, result)
		# On Python 2 the console expects CP949 bytes rather than unicode.
		print(message.encode('cp949') if str is bytes else message)
		try:
			os.rename(n, result)
		except OSError as err:
			# Report the failure and carry on with the remaining names.
			sys.stderr.write('Rename failed: %s\n' % (err,))
	#!/usr/bin/python
	# -- coding: cp949 --
	"""
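	Python script that restores the original string from a string whose Hangul
	syllables have been split into individual jamo.
	Between Linux and OS X the file names stay UTF-8 encoded, so converting
	between NFD and NFC is all that is needed. On Windows, however, the names are
	fully re-encoded to CP949 and the NFD/NFC information carried under UTF-8 is
	lost. The lost information is therefore reconstructed from the rules that
	govern the order in which jamo appear.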
	"""
	from unicodedata import normalize
	import sys
	import os

	# Hangul compatibility jamo tables. A jamo's position in its list is the index
	# used when composing a syllable code point (19 initials, 21 medials, 28 finals).
	chosung = list(u'ㄱㄲㄴㄷㄸㄹㅁㅂㅃㅅㅆㅇㅈㅉㅊㅋㅌㅍㅎ')
	jungsong = list(u'ㅏㅐㅑㅒㅓㅔㅕㅖㅗㅘㅙㅚㅛㅜㅝㅞㅟㅠㅡㅢㅣ')
	# Slot 0 is a blank placeholder so that the real final consonants sit at 1..27.
	jongsong = list(u' ㄱㄲㄳㄴㄵㄶㄷㄹㄺㄻㄼㄽㄾㄿㅀㅁㅂㅄㅅㅆㅇㅈㅊㅋㅌㅍㅎ')


	def normalize_str(form, str, encoding):
		"""
		Return ``str`` as text in the Unicode normalization form ``form``.

		form     -- normalization form name passed to unicodedata.normalize
		            (e.g. 'NFC' or 'NFD').
		str      -- byte string encoded with ``encoding``, or already-decoded text.
		encoding -- codec used to decode ``str`` when it is a byte string.
		"""
		# Only byte strings need decoding; calling .decode() on text would make
		# Python 2 encode it as ASCII first and fails outright on Python 3.
		text = str.decode(encoding) if isinstance(str, bytes) else str
		return normalize(form, text)


	def jamo_input(input_string):
		"""
		Group the jamo of ``input_string`` into syllable index tuples.

		Every character is looked up in the ``chosung`` (initial), ``jungsong``
		(medial) and ``jongsong`` (final) tables. A consonant that comes after
		a medial is taken as the initial of the next syllable only when the
		character following it is a medial; otherwise it is a final.

		Returns a list, in input order, whose items are either a 3-tuple
		``(chosung_index, jungsong_index, jongsong_index)`` with ``-1`` for a
		component that is absent, or the original character when it is not a
		jamo.
		"""
		strlen = len(input_string)
		result = []
		buf = []  # indices of the syllable currently being assembled
		jungsong_detected = False

		for idx, jamo_org in enumerate(input_string):
			# strip() turns whitespace into u'' so that a blank is never
			# matched against the blank placeholder stored at jongsong[0].
			jamo = jamo_org.strip()

			if jungsong_detected:
				# After a medial, a consonant opens a new syllable only if a
				# medial follows it; otherwise it closes the current syllable.
				next_is_jung = idx + 1 < strlen and input_string[idx + 1] in jungsong
				is_chos = next_is_jung and jamo in chosung
			else:
				is_chos = jamo in chosung

			if is_chos:
				if buf:
					result.append(buf)
				buf = [chosung.index(jamo)]
				jungsong_detected = False
			elif jamo in jungsong:
				if len(buf) == 1:
					buf.append(jungsong.index(jamo))
				else:
					# No pending initial: start a syllable without one.
					if buf:
						result.append(buf)
					buf = [-1, jungsong.index(jamo)]
				jungsong_detected = True
			elif jamo in jongsong:
				if len(buf) != 2:
					# A final with no initial+medial before it stands alone.
					if buf:
						result.append(buf)
					buf = [-1, -1]
				buf.append(jongsong.index(jamo))
				result.append(buf)
				buf = []
				jungsong_detected = False
			else:
				# Not a jamo: close any open syllable and pass the character through.
				if buf:
					result.append(buf)
					buf = []
				result.append(jamo_org)

		# Flush the syllable still open when the input ends; without this a
		# trailing syllable that has no final consonant would be lost.
		if buf:
			result.append(buf)

		# Pad every index list to length 3 with -1 and freeze it as a tuple.
		return [
			tuple(item + [-1] * (3 - len(item))) if isinstance(item, list) else item
			for item in result
		]


	def build_string_from_indices(jamo_indices):
		"""
		Build the recomposed string from the items produced by ``jamo_input``.

		jamo_indices -- iterable whose items are either 3-tuples
		                ``(chosung_index, jungsong_index, jongsong_index)``
		                (``-1`` meaning "absent") or plain characters.

		A tuple holding both an initial and a medial becomes one precomposed
		Hangul syllable. A tuple missing either of them cannot form a
		syllable, so the jamo it does hold are emitted unchanged instead of
		being forced into a syllable with index 0 substituted for the missing
		part. Plain characters are copied through. Returns the resulting text.
		"""
		# unichr exists only on Python 2, where str is bytes; chr is its
		# Python 3 equivalent. The unused name is never evaluated.
		to_char = unichr if str is bytes else chr
		pieces = []

		for item in jamo_indices:
			if not isinstance(item, tuple):
				pieces.append(item)
				continue

			cho, jung, jong = item
			if cho > -1 and jung > -1:
				# 0xAC00 is the first syllable; there are 21 medials per
				# initial and 28 final slots per medial.
				pieces.append(to_char(0xac00 + (cho * 21 + jung) * 28 + max(jong, 0)))
				continue

			# Incomplete syllable: give back the individual jamo as they were.
			if cho > -1:
				pieces.append(chosung[cho])
			if jung > -1:
				pieces.append(jungsong[jung])
			if jong > 0:
				pieces.append(jongsong[jong])

		return u''.join(pieces)


	if __name__ == '__main__':
		# Rename every path given on the command line to its recomposed form.
		for n in sys.argv[1:]:
			# Python 2 delivers arguments as CP949 byte strings, Python 3 as text.
			name = n.decode('cp949') if isinstance(n, bytes) else n

			# Recompose only the last path component: the directory part must
			# keep naming the directory that actually exists.
			head, tail = os.path.split(name)
			new_tail = build_string_from_indices(jamo_input(tail))
			if new_tail == tail:
				continue
			result = os.path.join(head, new_tail)

			message = u"Rename \"%s\" to \"%s\"" % (name, result)
			# On Python 2 the console expects CP949 bytes rather than unicode.
			print(message.encode('cp949') if str is bytes else message)
			try:
				os.rename(n, result)
			except OSError as err:
				# Report the failure and carry on with the remaining names.
				sys.stderr.write('Rename failed: %s\n' % (err,))