Skip to content

Instantly share code, notes, and snippets.

@erkyrath
Created December 21, 2020 20:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save erkyrath/0e09cc8a2bf16c1238fd87d85158ef7f to your computer and use it in GitHub Desktop.
Save erkyrath/0e09cc8a2bf16c1238fd87d85158ef7f to your computer and use it in GitHub Desktop.
Trying to implement the suffix-tree algorithm for finding abbreviations
# This uses the suffix-tree library (https://pypi.org/project/suffix-tree/).
# pip3 install suffix-tree
import sys
from suffix_tree import Tree
from suffix_tree.node import Internal, Leaf
from suffix_tree.util import UniqueEndChar
if len(sys.argv) <= 1:
print('usage: work.py game.txt')
sys.exit(-1)
# Read the list of strings.
fl = open(sys.argv[1])
stringlist = fl.read().split('\n')
fl.close()
# Remove blank lines.
stringlist = [ val for val in stringlist if val ]
def pathstr(nod):
path = nod.path
ls = path.S[path.start:path.end]
return ''.join([ str(val) for val in ls if type(val) is not UniqueEndChar ])
def find_best_abbrev(stringlist):
ils = [ ('s%d' % (ix,), val) for ix, val in enumerate(stringlist) ]
tree = Tree(dict(ils))
best = [None, None, None] # savings, count, node
traverse(tree.root, (), tree, best)
abbrev = pathstr(best[2])
#print('best: %d, %d: %r' % (best[0], best[1], abbrev,))
if best[0] <= 0:
# Savings is zero or negative -- give up.
return '', 0, 0
return abbrev, best[0], best[1]
def traverse(nod, prefix, tree, best):
abbrev = pathstr(nod)
count = len(tree.find_all(abbrev))
cost = len(abbrev)
savings = count * (cost-2) - cost
if best[0] is None or savings > best[0]:
best[0] = savings
best[1] = count
best[2] = nod
#prefstr = ''.join([ str(val) for val in prefix ])
#print('abbrev=%r, prefstr=%r, count=%d, save=%d: %s(%s)' % (prefstr, abbrev, count, savings, type(nod).__name__, nod))
if type(nod) is Internal:
for ch, nod in nod.children.items():
traverse(nod, prefix+(ch,), tree, best)
abbrevs = []
origlen = sum([ len(val) for val in stringlist ])
totalcount = 0
while len(abbrevs) < 96:
abbrev, savings, count = find_best_abbrev(stringlist)
if not abbrev:
break
print('%r (count %d, savings %d)' % (abbrev, count, savings))
abbrevs.append(abbrev)
totalcount += count
# Create a new string list by breaking existing strings on the new abbreviation.
ls = []
for val in stringlist:
subls = val.split(abbrev)
ls.extend(subls)
stringlist = [ val for val in ls if val ]
finallen = sum([ len(val) for val in stringlist ])
print('Reduced from %d char to %d using %d abbrevs; saved %d' % (origlen, finallen, totalcount, origlen-(finallen+2*totalcount)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment