frafra/frafra-display-levenshtein.py

## frafra-display-levenshtein.py
#!/usr/bin/env python
#
# frafra-display-levenshtein.py
#
# Copyright (C) 2014 - Francesco Frassinelli
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import print_function
from Levenshtein import editops, apply_edit
from random import shuffle
from math import factorial
from multiprocessing import Process, Array, cpu_count
from sys import maxint

words = 'lorem ipsum dolor sit amet'.split()

def clean(plain, words):
    used = set([0])
    for word in words:
        index = 0
        for letter in word:
            index = plain.index(letter, index)
            used.add(index)
            index += 1
    return ''.join([plain[i] for i in sorted(used)])

def discover(value, iterations, lowerbound=-1):
    try:
        for index in xrange(1, iterations):
            shuffle(words)
            current = words[0]
            for word in words[1:]:
                edit = editops(current, word)
                customEdit = [('insert', s, d) for op, s, d in edit if op != 'delete']
                current = apply_edit(customEdit, current, word)
            current = clean(current, words)
            if len(current) < len(value.value):
                value.value = current[:]
                print('%s (%d)' % (value.value, len(value.value)))
            if len(value.value) <= lowerbound:
                raise KeyboardInterrupt
    except:
        pass

processes = []
value = Array('c', ''.join(words))
for i in xrange(cpu_count()):
    iterations = factorial(sum(map(len, words)))
    lowerbound = len(set(''.join(words)))+sum(map(lambda s: len(s)-len(set(s)), words))
    if iterations > maxint:
        iterations = maxint
    p = Process(target=discover, args=(value, iterations/cpu_count()+1, lowerbound))
    p.start()
    processes.append(p)
try:
    for process in processes:
        process.join()
except KeyboardInterrupt:
    pass

print()

plain = value.value

print("Original size: %d" % sum(map(len, words)))
print("Compressed: %d" % len(plain), end='\n'*2)

for word in words:
    index = 0
    for letter in plain:
        if index >= len(word):
            print(letter, end='')
        elif letter == word[index]:
            print('\033[96m%s\033[0m' % letter, end='')
            index += 1
        else:
            print(letter, end='')
    print(' (%s)' % word)
	#!/usr/bin/env python
	#
	# frafra-display-levenshtein.py
	#
	# Copyright (C) 2014 - Francesco Frassinelli
	#
	# This program is free software; you can redistribute it and/or modify
	# it under the terms of the GNU General Public License as published by
	# the Free Software Foundation; either version 3 of the License, or
	# (at your option) any later version.
	#
	# This program is distributed in the hope that it will be useful,
	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	# GNU General Public License for more details.
	#
	# You should have received a copy of the GNU General Public License
	# along with this program. If not, see <http://www.gnu.org/licenses/>.

	from __future__ import print_function
	from Levenshtein import editops, apply_edit
	from random import shuffle
	from math import factorial
	from multiprocessing import Process, Array, cpu_count
	from sys import maxint

	words = 'lorem ipsum dolor sit amet'.split()

	def clean(plain, words):
	used = set([0])
	for word in words:
	index = 0
	for letter in word:
	index = plain.index(letter, index)
	used.add(index)
	index += 1
	return ''.join([plain[i] for i in sorted(used)])

	def discover(value, iterations, lowerbound=-1):
	try:
	for index in xrange(1, iterations):
	shuffle(words)
	current = words[0]
	for word in words[1:]:
	edit = editops(current, word)
	customEdit = [('insert', s, d) for op, s, d in edit if op != 'delete']
	current = apply_edit(customEdit, current, word)
	current = clean(current, words)
	if len(current) < len(value.value):
	value.value = current[:]
	print('%s (%d)' % (value.value, len(value.value)))
	if len(value.value) <= lowerbound:
	raise KeyboardInterrupt
	except:
	pass

	processes = []
	value = Array('c', ''.join(words))
	for i in xrange(cpu_count()):
	iterations = factorial(sum(map(len, words)))
	lowerbound = len(set(''.join(words)))+sum(map(lambda s: len(s)-len(set(s)), words))
	if iterations > maxint:
	iterations = maxint
	p = Process(target=discover, args=(value, iterations/cpu_count()+1, lowerbound))
	p.start()
	processes.append(p)
	try:
	for process in processes:
	process.join()
	except KeyboardInterrupt:
	pass

	print()

	plain = value.value

	print("Original size: %d" % sum(map(len, words)))
	print("Compressed: %d" % len(plain), end='\n'*2)

	for word in words:
	index = 0
	for letter in plain:
	if index >= len(word):
	print(letter, end='')
	elif letter == word[index]:
	print('\033[96m%s\033[0m' % letter, end='')
	index += 1
	else:
	print(letter, end='')
	print(' (%s)' % word)