Skip to content

Instantly share code, notes, and snippets.

@yunake
Created May 22, 2014 16:58
Show Gist options
  • Save yunake/c3ef87455c84b9ac9513 to your computer and use it in GitHub Desktop.
Save yunake/c3ef87455c84b9ac9513 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
This module provides two functions, transform() & untransform(), and the second
one should reverse the output of the first one. Since the transformation is
lossy, just provide a list of strings that satisfy the conditions, any one of
them might've been the original source and there is no way to know for sure.
The transformation is as follows: assume an English-language string as input,
in every word shuffle the letters in random order, then remove all whitespace
to create one long continuous string that looks like abracadabra, e.g.:
"a good cat" -> "adogoatc" or "aoodgcta"
If run directly from the terminal, execute the tests.
Python 2.5+ and 3.0+ compatible.
Copyright (c) 2014, Eugene Yunak <eugene@yunak.eu>
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
from __future__ import with_statement
from random import shuffle
from itertools import product
import six
def transform(string):
"""
Perform the transformation described above, on the input string.
Strip non-alpha characters, make lowercase.
Return resulting string.
"""
# String needs to be all lowercase and alpha+whitespace only.
string = string.lower()
string = ''.join(filter(lambda c: str.isalpha(c) or str.isspace(c), string))
words_in = string.split()
words_out = list()
for word in words_in:
# random.shuffle only works on a list, does the transformations in-place
# and does not return it, so we need to convert to a temporary list
word = list(word)
shuffle(word)
words_out.append(''.join(word))
return ''.join(words_out)
def untransform(string):
"""
Reverse the transformation described above.
Input is a string processed via transform().
Assume English language, no non-alpha characters, lowercase.
Use a hard-coded system-wide dictionary as a list of English words.
Return a list of all possible strings that satisfy the transformation.
"""
# This function defines a number of helper functions and then just runs
# `get_words()` as the starting point. This is used just to isolate the
# namespace, the functions aren't real closures since they don't reference
# nonlocal variables, in fact this function does not have any.
def isword(letters, words={}):
"""
Checks if the given combination of letters can form a word.
Returns a list of all possible words, otherwise None.
"""
# Populate the words dict once. Since `words` is a default parameter,
# so it's initialised on definition, and since it's a mutable object
# and I use update() to change it in place, the changes persist between
# executions.
if not words:
words.update(make_words())
try:
return words[sort_word(letters)]
except KeyError:
return False
def sort_word(letters):
"""
Return a string of aphabetically sorted letters. E.g.:
'navi' -> 'ainv', 'dendi' -> 'ddein'
"""
return ''.join(sorted(letters))
def make_words():
"""
Populate the word list from a hardcoded systemwide list.
Store as dict, key is the word letters sorted alphabetically,
value is a list of all words that can be made out of same letters.
"""
DICTFILE = "/usr/share/dict/words"
worddict = {}
with open(DICTFILE, 'r') as wordfile:
for word in wordfile:
# Skip erroneous entries, the dict has extra entries for plural.
# Lots of junk in there, have to ignore short words
# because it has pretty much all letter combos <= 2 letters.
if word[-2:] == "'s": # or len(word) <= 2: # temp turn off
continue
# Word needs to be lowercase and alpha.
word = word.lower()
word = ''.join(filter(str.isalpha, word))
key = sort_word(word)
try:
worddict[key]
except KeyError:
worddict[key] = []
if word not in worddict[key]:
worddict[key].append(word)
return worddict
def string_product(*args):
"""
Combine elements of all supplied lists into a list of strings,
with elements separated by space, covering all permutations. E.g.:
['a', 'b'], ['x', 'y'], ['f', 'h'] ->
[ 'a x f', 'a x h', 'a y f', 'a y h',
'b x f', 'b x h', 'b y f', 'b y h' ]
"""
return [' '.join(i) for i in product(*args)]
def get_words(string):
"""
Recursive function that accepts string as input and tries to split
it into English words, assumes the transformation described above.
Returns a list of strings, each string represents
one possible combination of words.
Returns an empty list of no possible combinations were found.
"""
output = []
# Go through slices out of the string, one more character at a time,
# and see if there are words that can be formed out of these letters.
for index in range(len(string)):
found_words = isword(string[:index+1])
# If there are no words out of these letters, just move on and add
# one more letter to the slice on the next cycle.
if found_words:
# Is there more string to process once we cut off our word?
if string[index+1:]:
# Process the rest of the string.
rest_of_words = get_words(string[index+1:])
if rest_of_words:
output += string_product(found_words, rest_of_words)
# If there is more string to process but no words can be
# fitted in there, then the word we found does not fit us,
# just move on to the next, bigger, slice.
# If we got a slice that equals the lenght of the string, and
# we found words that fit that slice, they are our only result.
else:
output += found_words
# If we didn't find any combination of words and rest_of_words to fit
# the whole lenght of the string, the output list will remain empty.
return output
# Run the motherfucker!
return get_words(string)
if __name__ == '__main__':
# Simply run the tests
from string import ascii_letters
from unittest import TestCase
from unittest import main as tests_main
class TestTransformations(TestCase):
def setUp(self):
self.strings = [
"mark these words",
"you love this fluffy cat",
"special characters",
]
# TODO: non-alpha, mixed case, non-english, corner-cases
self.inputs = {}
for str_plain in self.strings:
str_transformed = transform(str_plain)
self.inputs[str_transformed] = str_plain
def test_transform(self):
for str_in in self.inputs:
plain = list(self.inputs[str_in])
for char in str_in:
plain.remove(char)
# TODO: need to also check that the order is preserved in words
for char in plain:
self.assertNotIn(char, ascii_letters)
def test_untransform(self):
for str_in in self.inputs:
self.assertIn(self.inputs[str_in], untransform(str_in))
tests_main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment