# cjnghn/1_preprocessing.py

## 1_preprocessing.py
"""Tokenizer()

Used for tokenization and integer encoding (assigning an index to each word).
"""

from tensorflow.keras.preprocessing.text import Tokenizer

t = Tokenizer()
fit_text = "The earth is an awesome place to live"
# Build the vocabulary (word -> integer index) from the fitting text.
t.fit_on_texts([fit_text])

test_text = "The earth is an great place to live"
# texts_to_sequences takes a list of texts and returns one index list per
# text, so [0] selects the sequence for the single test sentence.
seq = t.texts_to_sequences([test_text])[0]

# "great" is not in the vocabulary (V), so it is left out of the sequence.
print("sequences: ", seq)
# Print the vocabulary.
print("word index: ", t.word_index)

# Expected output:
# sequences:  [1, 2, 3, 4, 6, 7, 8]
# word index:  {'the': 1, 'earth': 2, 'is': 3, 'an': 4, 'awesome': 5, 'place': 6, 'to': 7, 'live': 8}

## 2_pad_sequence.py
"""pad_sequences()

Samples in a training set can differ in length: each document or sentence
has its own number of words.

A model often needs every input sample to have the same length. In natural
language processing, making the lengths equal is called padding; usually
the value 0 is inserted into the shorter samples.
"""

from tensorflow.keras.preprocessing.sequence import pad_sequences

# maxlen=3 fixes every row at length 3. padding='pre' puts zeros at the front
# of short samples ([7, 8] -> [0, 7, 8]). Long samples are cut from the front
# as well, because `truncating` defaults to 'pre' ([3, 4, 5, 6] -> [4, 5, 6]).
padded = pad_sequences([[1, 2, 3], [3, 4, 5, 6], [7, 8]], maxlen=3, padding='pre')
# Print the result so it is visible when the file runs as a script, not only
# in an interactive session.
print(padded)

# Expected output:
# [[1 2 3]
#  [4 5 6]
#  [0 7 8]]
	"""Tokenizer()

	Used for tokenization and integer encoding (assigning an index to each word).
	"""
	# NOTE(review): this tab-indented section repeats the Tokenizer example
	# found earlier in the file; no enclosing block is visible for the
	# indentation — confirm whether the duplicate should be kept.

	from tensorflow.keras.preprocessing.text import Tokenizer

	t = Tokenizer()
	fit_text = "The earth is an awesome place to live"
	# Build the vocabulary (word -> integer index) from the fitting text.
	t.fit_on_texts([fit_text])

	test_text = "The earth is an great place to live"
	# texts_to_sequences takes a list of texts and returns one index list per
	# text, so [0] selects the sequence for the single test sentence.
	seq = t.texts_to_sequences([test_text])[0]

	# "great" is not in the vocabulary (V), so it is left out of the sequence.
	print("sequences: ", seq)
	# Print the vocabulary.
	print("word index: ", t.word_index)

	# Expected output:
	# sequences:  [1, 2, 3, 4, 6, 7, 8]
	# word index:  {'the': 1, 'earth': 2, 'is': 3, 'an': 4, 'awesome': 5, 'place': 6, 'to': 7, 'live': 8}
	"""pad_sequences()

	Samples in a training set can differ in length: each document or sentence
	has its own number of words.

	A model often needs every input sample to have the same length. In natural
	language processing, making the lengths equal is called padding; usually
	the value 0 is inserted into the shorter samples.
	"""
	# NOTE(review): this tab-indented section repeats the pad_sequences example
	# found earlier in the file; no enclosing block is visible for the
	# indentation — confirm whether the duplicate should be kept.

	from tensorflow.keras.preprocessing.sequence import pad_sequences

	# maxlen=3 fixes every row at length 3. padding='pre' puts zeros at the
	# front of short samples ([7, 8] -> [0, 7, 8]). Long samples are cut from
	# the front as well, because `truncating` defaults to 'pre'
	# ([3, 4, 5, 6] -> [4, 5, 6]).
	padded = pad_sequences([[1, 2, 3], [3, 4, 5, 6], [7, 8]], maxlen=3, padding='pre')
	# Print the result so it is visible when the file runs as a script, not
	# only in an interactive session.
	print(padded)

	# Expected output:
	# [[1 2 3]
	#  [4 5 6]
	#  [0 7 8]]