Skip to content

Instantly share code, notes, and snippets.

@tsuchm
Created December 24, 2023 02:59
Show Gist options
  • Save tsuchm/cb8e5a950317009501675fd2b5297562 to your computer and use it in GitHub Desktop.
Save tsuchm/cb8e5a950317009501675fd2b5297562 to your computer and use it in GitHub Desktop.
# Python implementation of https://aclanthology.org/P01-1064.pdf
#
# How to use:
# from textseg import textseg
# segments = textseg(texts)
from math import log2
import unittest
class TextSegParser():
__tagger = None
def __call__(self, text):
cls = type(self)
if not cls.__tagger:
import MeCab
cls.__tagger = MeCab.Tagger()
node = cls.__tagger.parseToNode(text)
nouns = []
while node:
features = node.feature.split(',')
if features[0] == '名詞' and features[1] != '非自立':
nouns.append(node.surface)
node = node.next
return nouns
class TextSegUnit():
def __init__(self, text, words):
vocab = dict()
for word in words:
vocab[word] = vocab.get(word, 0) + 1
self.text = text
self.vocab = vocab
self.length = len(words)
self.bestpath = 0
self.bestcost = 0
def merge(self, other):
self.length += other.length
for word, freq in other.vocab.items():
self.vocab[word] = self.vocab.get(word, 0) + freq
# Note that we need to use the total vocabulary size of input
# texts instead of the vocabulary size of each segment. If we use
# the vocabulary size of each segment, we obtain too small segments.
def cost(self, vocabsize):
cost = 0
for freq in self.vocab.values():
cost -= freq * log2(freq + 1)
if self.length > 0:
cost += self.length * log2(self.length + vocabsize)
cost += log2(self.length)
return cost
def textseg(texts, parser=TextSegParser()):
# calculate vocabulary size of input texts
inputs = [TextSegUnit(text, parser(text)) for text in texts]
vocab = dict()
for unit in inputs:
for word in unit.vocab.keys():
vocab[word] = 1
vocabsize = len(vocab.keys())
# forward search
units = []
for last in inputs:
for unit in units:
unit.merge(last)
units.append(last)
bestpath = 0
bestcost = float('inf')
for i, unit in enumerate(units):
cost = unit.cost(vocabsize)
if i > 0:
cost += units[i-1].bestcost
if cost < bestcost:
bestpath = i
bestcost = cost
last.bestpath = bestpath
last.bestcost = bestcost
# backward search
segments = []
while len(units) > 0:
last = units[-1]
segments.append([u.text for u in units[last.bestpath:]])
units = units[:last.bestpath]
segments.reverse()
return segments
class TextSegTest(unittest.TestCase):
def test_parser(self):
parser = TextSegParser()
result = parser("形態素解析は重要な処理である")
expected = ['形態素', '解析', '重要', '処理']
self.assertEqual(result, expected)
def test_unit(self):
parser = TextSegParser()
text = "ある単語と別の単語が繰り返し出現する"
unit = TextSegUnit(text, parser(text))
self.assertEqual(unit.text, text)
self.assertEqual(unit.length, 4)
self.assertEqual(len(unit.vocab.keys()), 3)
expected_cost = -2 * log2(3/7) -2 * log2(2/7) + log2(4)
self.assertAlmostEqual(unit.cost(3), expected_cost)
other_text = "他の単語が出現する"
other_unit = TextSegUnit(other_text, parser(other_text))
self.assertEqual(other_unit.text, other_text)
self.assertEqual(other_unit.length, 3)
self.assertEqual(len(other_unit.vocab.keys()), 3)
unit.merge(other_unit)
self.assertEqual(unit.length, 7)
self.assertEqual(len(unit.vocab.keys()), 4)
def test_textseg(self):
with open('samples/gijiroku.txt', 'r') as fp:
segments = textseg(fp)
self.assertEqual(len(segments), 10)
# COPYRIGHT
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment