yunake/transform_from_google.py

## transform_from_google.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""

This module provides two functions, transform() & untransform(), and the second
one should reverse the output of the first one. Since the transformation is
lossy, just provide a list of strings that satisfy the conditions, any one of
them might've been the original source and there is no way to know for sure.

The transformation is as follows: assume an English-language string as input,
in every word shuffle the letters in random order, then remove all whitespace
to create one long continuous string that looks like abracadabra, e.g.:
    "a good cat" -> "adogoatc" or "aoodgcta"


If run directly from the terminal, execute the tests.
Python 2.5+ and 3.0+ compatible.


Copyright (c) 2014, Eugene Yunak <eugene@yunak.eu>

All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

"""


from __future__ import with_statement

from random     import shuffle
from itertools  import product

import six


def transform(string):
    """
        Perform the transformation described above, on the input string.
        Strip non-alpha characters, make lowercase.
        Return resulting string.
    """

    # String needs to be all lowercase and alpha+whitespace only.
    string = string.lower()
    string = ''.join(filter(lambda c: str.isalpha(c) or str.isspace(c), string))

    words_in = string.split()
    words_out = list()

    for word in words_in:
        # random.shuffle only works on a list, does the transformations in-place
        # and does not return it, so we need to convert to a temporary list
        word = list(word)
        shuffle(word)
        words_out.append(''.join(word))

    return ''.join(words_out)


def untransform(string):
    """
        Reverse the transformation described above.
        Input is a string processed via transform().
        Assume English language, no non-alpha characters, lowercase.
        Use a hard-coded system-wide dictionary as a list of English words.
        Return a list of all possible strings that satisfy the transformation.
    """

    # This function defines a number of helper functions and then just runs
    # `get_words()` as the starting point. This is used just to isolate the
    # namespace, the functions aren't real closures since they don't reference
    # nonlocal variables, in fact this function does not have any.


    def isword(letters, words={}):
        """
            Checks if the given combination of letters can form a word.
            Returns a list of all possible words, otherwise None.
        """

        # Populate the words dict once. Since `words` is a default parameter,
        # so it's initialised on definition, and since it's a mutable object
        # and I use update() to change it in place, the changes persist between
        # executions.
        if not words:
            words.update(make_words())

        try:
            return words[sort_word(letters)]
        except KeyError:
            return False


    def sort_word(letters):
        """
            Return a string of aphabetically sorted letters. E.g.:
            'navi' -> 'ainv', 'dendi' -> 'ddein'
        """
        return ''.join(sorted(letters))


    def make_words():
        """
            Populate the word list from a hardcoded systemwide list.
            Store as dict, key is the word letters sorted alphabetically,
            value is a list of all words that can be made out of same letters.
        """

        DICTFILE = "/usr/share/dict/words"
        worddict = {}

        with open(DICTFILE, 'r') as wordfile:

            for word in wordfile:
                # Skip erroneous entries, the dict has extra entries for plural.
                # Lots of junk in there, have to ignore short words
                # because it has pretty much all letter combos <= 2 letters.
                if word[-2:] == "'s": # or len(word) <= 2: # temp turn off
                    continue

                # Word needs to be lowercase and alpha.
                word = word.lower()
                word = ''.join(filter(str.isalpha, word))

                key = sort_word(word)

                try:
                    worddict[key]
                except KeyError:
                    worddict[key] = []

                if word not in worddict[key]:
                        worddict[key].append(word)

        return worddict


    def string_product(*args):
        """
            Combine elements of all supplied lists into a list of strings,
            with elements separated by space, covering all permutations. E.g.:
                ['a', 'b'], ['x', 'y'], ['f', 'h'] ->
                [ 'a x f', 'a x h', 'a y f', 'a y h',
                  'b x f', 'b x h', 'b y f', 'b y h' ]
        """
        return [' '.join(i) for i in product(*args)]


    def get_words(string):
        """
            Recursive function that accepts string as input and tries to split
            it into English words, assumes the transformation described above.
            Returns a list of strings, each string represents
            one possible combination of words.
            Returns an empty list of no possible combinations were found.
        """

        output = []

        # Go through slices out of the string, one more character at a time,
        # and see if there are words that can be formed out of these letters.
        for index in range(len(string)):
            found_words = isword(string[:index+1])

            # If there are no words out of these letters, just move on and add
            # one more letter to the slice on the next cycle.
            if found_words:

                # Is there more string to process once we cut off our word?
                if string[index+1:]:

                    # Process the rest of the string.
                    rest_of_words = get_words(string[index+1:])

                    if rest_of_words:
                        output += string_product(found_words, rest_of_words)
                    # If there is more string to process but no words can be
                    # fitted in there, then the word we found does not fit us,
                    # just move on to the next, bigger, slice.

                # If we got a slice that equals the lenght of the string, and
                # we found words that fit that slice, they are our only result.
                else:
                    output += found_words

        # If we didn't find any combination of words and rest_of_words to fit
        # the whole lenght of the string, the output list will remain empty.
        return output


    # Run the motherfucker!
    return get_words(string)


if __name__ == '__main__':

    # Simply run the tests

    from string   import ascii_letters
    from unittest import TestCase
    from unittest import main as tests_main

    class TestTransformations(TestCase):
        def setUp(self):
            self.strings = [
                             "mark these words",
                             "you love this fluffy cat",
                             "special characters",
                           ]
            # TODO: non-alpha, mixed case, non-english, corner-cases
            self.inputs = {}

            for str_plain in self.strings:
                str_transformed = transform(str_plain)
                self.inputs[str_transformed] = str_plain

        def test_transform(self):
            for str_in in self.inputs:
                plain = list(self.inputs[str_in])

                for char in str_in:
                    plain.remove(char)

                # TODO: need to also check that the order is preserved in words
                for char in plain:
                    self.assertNotIn(char, ascii_letters)

        def test_untransform(self):
            for str_in in self.inputs:
                self.assertIn(self.inputs[str_in], untransform(str_in))


    tests_main()
	#!/usr/bin/env python
	# -- coding: utf-8 --

	"""

	This module provides two functions, transform() & untransform(), and the second
	one should reverse the output of the first one. Since the transformation is
	lossy, just provide a list of strings that satisfy the conditions, any one of
	them might've been the original source and there is no way to know for sure.

	The transformation is as follows: assume an English-language string as input,
	in every word shuffle the letters in random order, then remove all whitespace
	to create one long continuous string that looks like abracadabra, e.g.:
	"a good cat" -> "adogoatc" or "aoodgcta"


	If run directly from the terminal, execute the tests.
	Python 2.5+ and 3.0+ compatible.


	Copyright (c) 2014, Eugene Yunak <eugene@yunak.eu>

	All rights reserved.

	Redistribution and use in source and binary forms, with or without modification,
	are permitted provided that the following conditions are met:

	1. Redistributions of source code must retain the above copyright notice, this
	list of conditions and the following disclaimer.

	2. Redistributions in binary form must reproduce the above copyright notice,
	this list of conditions and the following disclaimer in the documentation and/or
	other materials provided with the distribution.

	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
	ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
	WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
	ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
	(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
	ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
	SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

	"""


	from __future__ import with_statement

	from random import shuffle
	from itertools import product

	import six


	def transform(string):
	"""
	Perform the transformation described above, on the input string.
	Strip non-alpha characters, make lowercase.
	Return resulting string.
	"""

	# String needs to be all lowercase and alpha+whitespace only.
	string = string.lower()
	string = ''.join(filter(lambda c: str.isalpha(c) or str.isspace(c), string))

	words_in = string.split()
	words_out = list()

	for word in words_in:
	# random.shuffle only works on a list, does the transformations in-place
	# and does not return it, so we need to convert to a temporary list
	word = list(word)
	shuffle(word)
	words_out.append(''.join(word))

	return ''.join(words_out)


	def untransform(string):
	"""
	Reverse the transformation described above.
	Input is a string processed via transform().
	Assume English language, no non-alpha characters, lowercase.
	Use a hard-coded system-wide dictionary as a list of English words.
	Return a list of all possible strings that satisfy the transformation.
	"""

	# This function defines a number of helper functions and then just runs
	# `get_words()` as the starting point. This is used just to isolate the
	# namespace, the functions aren't real closures since they don't reference
	# nonlocal variables, in fact this function does not have any.


	def isword(letters, words={}):
	"""
	Checks if the given combination of letters can form a word.
	Returns a list of all possible words, otherwise None.
	"""

	# Populate the words dict once. Since `words` is a default parameter,
	# so it's initialised on definition, and since it's a mutable object
	# and I use update() to change it in place, the changes persist between
	# executions.
	if not words:
	words.update(make_words())

	try:
	return words[sort_word(letters)]
	except KeyError:
	return False


	def sort_word(letters):
	"""
	Return a string of aphabetically sorted letters. E.g.:
	'navi' -> 'ainv', 'dendi' -> 'ddein'
	"""
	return ''.join(sorted(letters))


	def make_words():
	"""
	Populate the word list from a hardcoded systemwide list.
	Store as dict, key is the word letters sorted alphabetically,
	value is a list of all words that can be made out of same letters.
	"""

	DICTFILE = "/usr/share/dict/words"
	worddict = {}

	with open(DICTFILE, 'r') as wordfile:

	for word in wordfile:
	# Skip erroneous entries, the dict has extra entries for plural.
	# Lots of junk in there, have to ignore short words
	# because it has pretty much all letter combos <= 2 letters.
	if word[-2:] == "'s": # or len(word) <= 2: # temp turn off
	continue

	# Word needs to be lowercase and alpha.
	word = word.lower()
	word = ''.join(filter(str.isalpha, word))

	key = sort_word(word)

	try:
	worddict[key]
	except KeyError:
	worddict[key] = []

	if word not in worddict[key]:
	worddict[key].append(word)

	return worddict


	def string_product(*args):
	"""
	Combine elements of all supplied lists into a list of strings,
	with elements separated by space, covering all permutations. E.g.:
	['a', 'b'], ['x', 'y'], ['f', 'h'] ->
	[ 'a x f', 'a x h', 'a y f', 'a y h',
	'b x f', 'b x h', 'b y f', 'b y h' ]
	"""
	return [' '.join(i) for i in product(*args)]


	def get_words(string):
	"""
	Recursive function that accepts string as input and tries to split
	it into English words, assumes the transformation described above.
	Returns a list of strings, each string represents
	one possible combination of words.
	Returns an empty list of no possible combinations were found.
	"""

	output = []

	# Go through slices out of the string, one more character at a time,
	# and see if there are words that can be formed out of these letters.
	for index in range(len(string)):
	found_words = isword(string[:index+1])

	# If there are no words out of these letters, just move on and add
	# one more letter to the slice on the next cycle.
	if found_words:

	# Is there more string to process once we cut off our word?
	if string[index+1:]:

	# Process the rest of the string.
	rest_of_words = get_words(string[index+1:])

	if rest_of_words:
	output += string_product(found_words, rest_of_words)
	# If there is more string to process but no words can be
	# fitted in there, then the word we found does not fit us,
	# just move on to the next, bigger, slice.

	# If we got a slice that equals the lenght of the string, and
	# we found words that fit that slice, they are our only result.
	else:
	output += found_words

	# If we didn't find any combination of words and rest_of_words to fit
	# the whole lenght of the string, the output list will remain empty.
	return output


	# Run the motherfucker!
	return get_words(string)


	if __name__ == '__main__':

	# Simply run the tests

	from string import ascii_letters
	from unittest import TestCase
	from unittest import main as tests_main

	class TestTransformations(TestCase):
	def setUp(self):
	self.strings = [
	"mark these words",
	"you love this fluffy cat",
	"special characters",
	]
	# TODO: non-alpha, mixed case, non-english, corner-cases
	self.inputs = {}

	for str_plain in self.strings:
	str_transformed = transform(str_plain)
	self.inputs[str_transformed] = str_plain

	def test_transform(self):
	for str_in in self.inputs:
	plain = list(self.inputs[str_in])

	for char in str_in:
	plain.remove(char)

	# TODO: need to also check that the order is preserved in words
	for char in plain:
	self.assertNotIn(char, ascii_letters)

	def test_untransform(self):
	for str_in in self.inputs:
	self.assertIn(self.inputs[str_in], untransform(str_in))


	tests_main()