lcpz/sp.py

## sp.py
#! /usr/bin/env python

# Copyright 2015 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# original: https://goo.gl/kkF5SY

"""Check whether a given variable-length code is uniquely decodable.
This is a direct/naive implementation of the Sardinas-Patterson algorithm.
It can be used to check if e.g. a given phoneme inventory yields unambiguous
transcriptions.
"""

def LeftQuotientOfWord(ps, w):
    """Yields the suffixes of w after removing any prefix in ps."""
    for p in ps:
        if w.startswith(p):
            yield w[len(p):]
    return

def LeftQuotient(ps, ws):
    """Returns the set of suffixes of any word in ws after removing any prefix
    in ps. This is the quotient set which results from dividing ws on the
    left by ps."""
    qs = set()
    for w in ws:
        for q in LeftQuotientOfWord(ps, w):
            qs.add(q)
    return qs

def IsUniquelyDecodable(cs):
    """Checks if the set of codewords cs is uniquely decodable via the
    Sardinas-Patterson algorithm."""
    NL, i = len(str(cs)) * len(str(max(len(x) for x in cs))), 1 # Levenstein's upper bound for termination
    s = LeftQuotient(cs, cs)
    s.discard('')
    if len(s) == 0:
        print('Uniquely decodable prefix code.')
        return True
    while '' not in s and len(s & cs) == 0:
        t = LeftQuotient(cs, s) | LeftQuotient(s, cs)
        if t == s or i > NL + 1:
            print('Uniquely decodable.')
            return True
        s = t
        i += 1
    if '' in s:
        print('Dangling empty suffix.')
    for x in s & cs:
        print('Dangling suffix: {}'.format(x))
    return False

# TODO: add support for power and star operators, for instance: a^2, (ab)^n, a^*b, (ab)^*
if __name__ == '__main__':
    """Usage:
       1.a $ python sp.py
       1.b input code words, one per line (hit enter)
       1.c hit ctrl+d
       2   $ python sp.py code.txt
           where code.txt contains one code word per line
    """
    import sys

    cs = set()
    if len(sys.argv) > 1:
        with open(sys.argv[1]) as reader:
            for line in reader.readlines():
                for c in line.split():
                    cs.add(c)
    else:
        for line in sys.stdin.readlines():
            for c in line.split():
                cs.add(c)

    sys.exit(0 if IsUniquelyDecodable(cs) else 1)
	#! /usr/bin/env python

	# Copyright 2015 Google Inc. All Rights Reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	# original: https://goo.gl/kkF5SY

	"""Check whether a given variable-length code is uniquely decodable.
	This is a direct/naive implementation of the Sardinas-Patterson algorithm.
	It can be used to check if e.g. a given phoneme inventory yields unambiguous
	transcriptions.
	"""

	def LeftQuotientOfWord(ps, w):
	"""Yields the suffixes of w after removing any prefix in ps."""
	for p in ps:
	if w.startswith(p):
	yield w[len(p):]
	return

	def LeftQuotient(ps, ws):
	"""Returns the set of suffixes of any word in ws after removing any prefix
	in ps. This is the quotient set which results from dividing ws on the
	left by ps."""
	qs = set()
	for w in ws:
	for q in LeftQuotientOfWord(ps, w):
	qs.add(q)
	return qs

	def IsUniquelyDecodable(cs):
	"""Checks if the set of codewords cs is uniquely decodable via the
	Sardinas-Patterson algorithm."""
	NL, i = len(str(cs)) * len(str(max(len(x) for x in cs))), 1 # Levenstein's upper bound for termination
	s = LeftQuotient(cs, cs)
	s.discard('')
	if len(s) == 0:
	print('Uniquely decodable prefix code.')
	return True
	while '' not in s and len(s & cs) == 0:
	t = LeftQuotient(cs, s) \| LeftQuotient(s, cs)
	if t == s or i > NL + 1:
	print('Uniquely decodable.')
	return True
	s = t
	i += 1
	if '' in s:
	print('Dangling empty suffix.')
	for x in s & cs:
	print('Dangling suffix: {}'.format(x))
	return False

	# TODO: add support for power and star operators, for instance: a^2, (ab)^n, a^b, (ab)^
	if __name__ == '__main__':
	"""Usage:
	1.a $ python sp.py
	1.b input code words, one per line (hit enter)
	1.c hit ctrl+d
	2 $ python sp.py code.txt
	where code.txt contains one code word per line
	"""
	import sys

	cs = set()
	if len(sys.argv) > 1:
	with open(sys.argv[1]) as reader:
	for line in reader.readlines():
	for c in line.split():
	cs.add(c)
	else:
	for line in sys.stdin.readlines():
	for c in line.split():
	cs.add(c)

	sys.exit(0 if IsUniquelyDecodable(cs) else 1)