Skip to content

Instantly share code, notes, and snippets.

@vladiibine
Created May 24, 2019 21:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save vladiibine/f01202ee77f664587b84cfb20ab22046 to your computer and use it in GitHub Desktop.
Save vladiibine/f01202ee77f664587b84cfb20ab22046 to your computer and use it in GitHub Desktop.
Trie for storing text. Very inefficient memory-wise
import copy
import unittest
class Node:
__slots__ = ["count", "next"]
def __init__(self, count, next_):
self.count = count
self.next = next_
class VladTrie:
"""Also a good name would have been VeryInefficientTrie
Stores words that share common roots more "efficiently".
At the moment, this obviously uses more memory than the text itself
It is therefore of no use! (well, maybe can serve as a fancier counter)
"""
__slots__ = ["data"]
def __init__(self, *strings, compress=False):
self.data = {"next": {}, "count":0}
# self.data = Node(0, None)
for s in strings:
self.add(s, compress=False)
if compress:
self.data = self.create_compressed_trie(self.data)
def add(self, s, node=None, compress=True):
if node:
current_node = node
else:
current_node = self.data
for index, c in enumerate(s):
if c in current_node["next"]:
current_node["next"][c]["count"] += 1
else:
current_node["next"][c] = {"count": 1, "next": {}}
current_node = current_node["next"][c]
if compress:
self.data = self.create_compressed_trie(self.data)
@classmethod
def _create_shorter_next_node(cls, char, node, parent_count):
if len(node["next"]) == 0:
return char, node
elif len(node["next"]) > 1:
new_node = {"count": node["count"], "next": {}}
for key, next_node in node["next"].items():
new_key, shorter_next_node = cls._create_shorter_next_node(
key, next_node, node["count"]
)
new_node["next"][new_key] = shorter_next_node
return char, new_node
else: # len(node["next"]) == 1
chars = [char]
cursor_node = node
while len(cursor_node["next"]) == 1:
parent_count = cursor_node["count"]
backup_node = cursor_node
new_key, cursor_node = list(cursor_node["next"].items())[0]
current_count = cursor_node["count"]
if parent_count != current_count:
# We have gone too far.
# We're in the Trie("a", "ab") situation, and need to
# allow "a" to exist as a key alone
cursor_node = backup_node
break
chars.append(new_key)
cursor_replacement_node = {"count": cursor_node["count"], "next": {}}
for key, child_node in cursor_node["next"].items():
new_key, new_child = cls._create_shorter_next_node(
key, child_node,
parent_count
)
cursor_replacement_node["next"][new_key] = new_child
return ''.join(chars), cursor_replacement_node
def create_compressed_trie(self, node=None):
if node is None:
node = self.data
_, new_node = self._create_shorter_next_node('', node, 0)
return new_node
def dump_dict(self, compress=True):
if compress:
return copy.deepcopy(self.create_compressed_trie(self.data))
else:
return copy.deepcopy(self.data)
class TestCreateCompressedTrie(unittest.TestCase):
def test_simpler_trie(self):
t = VladTrie("aa", "aabb1", "aabb2")
d = t.create_compressed_trie()
self.assertEqual({
"count": 0,
"next": {
"aa": {
"count": 3,
"next": {
"bb": {
"count": 2,
"next": {
"1": {
"count": 1,
"next": {}
},
"2": {
"count": 1,
"next": {}
}
}
},
}
}
}
}, d)
def test_second_branching(self):
t = VladTrie("aadd", "aaddx", "aaddy", "aaddzz1", "aaddzz2")
d = t.create_compressed_trie()
self.assertEqual({
"count": 0,
"next": {
"aadd": {
"count": 5,
"next": {
"x": {
"count": 1,
"next": {}
},
"y": {
"count": 1,
"next": {}
},
"zz": {
"count": 2,
"next": {
"1": {
"count": 1,
"next": {}
},
"2": {
"count": 1,
"next": {}}}}}}}
}, d)
def test_complex_trie(self):
t = VladTrie("aa", "aab", "aabc", "aabcddd", "aabcdddx", "aabcdddy", "aabcdddzz1", "aabcdddzz2")
d = t.create_compressed_trie()
self.assertEqual({
"count": 0,
"next": {
"aa": {
"count": 8,
"next": {
"b": {
"count": 7,
"next": {
"c": {
"count": 6,
"next": {
"ddd": {
"count": 5,
"next": {
"x": {
"count": 1,
"next": {}
},
"y": {
"count": 1,
"next": {}
},
"zz": {
"count": 2,
"next": {
"1": {
"count": 1,
"next": {}
},
"2": {
"count": 1,
"next": {}}}}}}}}}}}}}
}, d)
class TestCreateShorterNextNode(unittest.TestCase):
def test_single_letter_simple_shortening(self):
t = VladTrie("fo")
new_key, new_node = VladTrie._create_shorter_next_node(
"f", t.data["next"]["f"], 0)
self.assertEqual("fo", new_key)
self.assertEqual({"count": 1, "next": {}}, new_node)
def test_two_letter_simple_shortening(self):
t = VladTrie("bar")
new_key, new_node = VladTrie._create_shorter_next_node(
"b", t.data["next"]["b"], 0
)
self.assertEqual("bar", new_key)
self.assertEqual({"count": 1, "next": {}}, new_node)
def test_two_letter_shortening_then_stop(self):
t = VladTrie("qqq", "qqw")
new_key, new_node = VladTrie._create_shorter_next_node(
"q", t.data["next"]["q"], 0
)
self.assertEqual("qq", new_key)
self.assertEqual({
"count": 2,
"next": {
"q": {
"count":1,
"next":{}},
"w":{
"count":1,
"next":{}}}},
new_node
)
def test_two_chains(self):
t = VladTrie("qqqqii", "qqqqxxxzzz", "qqqqxxxttt")
new_key, new_node = VladTrie._create_shorter_next_node(
"q", t.data["next"]["q"], 0
)
self.assertEqual("qqqq", new_key)
self.assertEqual({
"count": 3,
"next": {
"ii": {
"count": 1,
"next": {}},
"xxx": {
"count": 2,
"next": {
"zzz": {
"count": 1,
"next": {},},
"ttt": {
"count": 1,
"next": {}}}},
}}, new_node)
class Tests(unittest.TestCase):
def test_add_1_string(self):
t = VladTrie("bostan")
self.assertEqual(
t.dump_dict(),
{
"count": 0,
"next":
{"bostan": {"count": 1, "next": {}}}
}
)
def test_add_2_unrelated_strings(self):
t = VladTrie("bostan", "masina")
self.assertEqual(
t.dump_dict(),
{
"count": 0,
"next":
{
"bostan": {"count": 1, "next": {}},
"masina": {"count": 1, "next": {}}
}
},
)
def test_add_2_related_strings_included_in_each_other(self):
t = VladTrie("bostan", "bostanilor")
self.assertEqual(
t.dump_dict(),
{
"count": 0,
"next":
{"bostan": {
"count": 2,
"next": {
"ilor": {
"count": 1,
"next": {}}}}}
}
)
def test_spread_one_word(self):
t = VladTrie("bar")
self.assertEqual(
t.dump_dict(False),
{
"next": {
"b": {
"count": 1,
"next": {
"a": {
"count": 1,
"next": {
"r": {
"count": 1,
"next": {}}}}}}},
"count": 0}
)
def test_spread_two_words(self):
t = VladTrie("bar", "foo")
self.assertEqual(
t.dump_dict(False),
{
"next": {
"b": {
"count": 1,
"next": {
"a": {
"count": 1,
"next": {
"r": {
"count": 1,
"next": {}}}}}},
"f": {
"count": 1,
"next": {
"o": {
"count": 1,
"next": {
"o": {
"count": 1,
"next": {}}}}}},
},
"count": 0
}
)
def test_spread_2_identical_words(self):
t = VladTrie("foo", "foo")
self.assertEqual(
t.dump_dict(compress=False),
{
"next": {
"f": {
"count": 2,
"next": {
"o": {
"count": 2,
"next": {
"o": {
"count": 2,
"next": {}}}}}},
},
"count": 0
}
)
def test_spread_2_words_with_common_roots(self):
t = VladTrie("bar", "baz")
self.assertEqual(
t.dump_dict(compress=False),
{
"next": {
"b": {
"count": 2,
"next": {
"a": {
"count": 2,
"next": {
"r": {
"count": 1,
"next": {}
},
"z": {
"count": 1,
"next": {}
}
}}}},
},
"count": 0
}
)
def test_spread_3_words_with_common_roots(self):
t = VladTrie("qqq", "qww", "qwe")
self.assertEqual(
t.dump_dict(compress=False),
{
"count": 0,
"next": {
"q": {
"count": 3,
"next": {
"q": {
"count": 1,
"next": {
"q": {
"count": 1,
"next": {},
}
}
},
"w": {
"count": 2,
"next": {
"w": {
"count": 1,
"next": {}
},
"e": {
"count": 1,
"next": {}
}
}
},
}
}
}
}
)
def main():
# add 1 string s1 to a dict D1 = {s1: d1} , where d1 = {"count": 1}
# retrieve another string s2
# compare it char by char to the string in temp_list1
# if char number 0 if the strings doesn't match, place s2 into temp_list1
# if s1 == s2, then increment d1["count"]
# if s1 has some chars at the beginning in common with s2 at the beginning, then
# remove d1 from temp_list1
# add d2 = {"str": s1[:X], "count": 2} to
# add 1 string s1 to dict d1 = {s1: {"count": 1, "}}
# retrieve a new string s2
# for each incremental slice of s2 (s2[:1], s2[:2], ...), check if such a
# string is found in the dict d1
# if such a substring is found, increment the int value of that key
# also,
# elif no such substring (or the entire string) is found, add d1[s2] = 1
#
pass
if __name__ == '__main__':
# main()
unittest.main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment