Skip to content

Instantly share code, notes, and snippets.

@alexalemi
Created August 31, 2011 03:24
Show Gist options
  • Star 8 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save alexalemi/1182747 to your computer and use it in GitHub Desktop.
Save alexalemi/1182747 to your computer and use it in GitHub Desktop.
Twitter Compression
""" A script to attempt the compression of written english
to the chinese character set """
import os
from collections import OrderedDict
from math import log
import itertools
from collections import Counter
# Grab the corpus from a text file. The context manager closes the file
# handle as soon as the text has been read instead of leaking it.
with open(os.path.join('corpus', 'fullcorpus.txt'), 'r') as corpus_file:
    corpus = corpus_file.read()

# Find the words in the corpus, and count them. Tokens of a single
# character are dropped, so they never get a dedicated replacement symbol.
words = corpus.split()
realwords = [w for w in words if len(w) > 1]
wordcount = Counter(realwords)
# Total number of counted word occurrences.
tot = sum(wordcount.values())
#Number of characters to work with 20931
def chinesechars():
    """Yield one character per code point, starting at U+4E00.

    Code points run from 0x4E00 up to, but not including, 0x9FA5 + 30,
    which gives 20931 distinct characters in ascending order.

    Yields:
        A one-character (unicode) string for each code point.
    """
    # Python 2 needs unichr() for code points above 255; Python 3 removed
    # it and chr() covers the full range. Pick whichever exists.
    try:
        to_char = unichr
    except NameError:
        to_char = chr
    chinesestart = 0x4E00
    chinesefinish = 0x9FA5 + 30
    for codepoint in range(chinesestart, chinesefinish):
        yield to_char(codepoint)
# One shared stream of unused replacement characters; every new dictionary
# entry consumes the next one.
characters = chinesechars()

# Define our translation dictionary. Insertion order matters: encoding
# applies the entries oldest-first and translate() undoes them newest-first.
rep = OrderedDict()

# Take the 1500 most common words. The trailing space is part of the key,
# so the separator after the word is absorbed into the replacement symbol.
for word, count in wordcount.most_common(1500):
    rep[word + ' '] = next(characters)

# Replace these words in a working copy of the corpus. items() is used
# instead of iteritems() so the loop runs on both Python 2 and Python 3.
newcorpus = corpus[:]
for f, t in rep.items():
    newcorpus = newcorpus.replace(f, t)
#Compute the entropy of a string
def entropy(sample):
    """Return the Shannon entropy of ``sample`` in bits per symbol.

    Each distinct element of ``sample`` is treated as one symbol and its
    relative frequency as its probability. The original author's stated
    target is 14.35 (presumably log2 of the 20931 available characters).
    """
    frequencies = Counter(sample)
    total = float(sum(frequencies.values()))

    def contribution(occurrences):
        # -p * log2(p) for a symbol seen ``occurrences`` times.
        probability = occurrences / total
        return -probability * log(probability, 2)

    return sum(contribution(n) for n in frequencies.values())
def translate(st):
    """Expand an encoded string back into the text it stands for.

    The entries of ``rep`` are undone newest-first: a later entry's key may
    itself contain symbols introduced by earlier entries, so it has to be
    expanded before those symbols are.
    """
    decoded = st
    for original in reversed(rep):
        symbol = rep[original]
        decoded = decoded.replace(symbol, original)
    return decoded
def display(guys):
    """Print the results of a step.

    Two lines are printed, each followed by two blank lines:

    * the candidates in ``guys``, translated back to plain text and shown
      as `` <text>: count | ``;
    * the 20 most frequent characters of the current ``newcorpus``,
      translated back and shown as `` [text]: count | ``.

    Args:
        guys: iterable of ``(sequence, count)`` pairs.
    """
    # Each line is assembled first and printed with a single-argument
    # print(...) call, which behaves the same as a Python 2 statement and
    # as a Python 3 function. Joining with ' ' reproduces the separator
    # that the old trailing-comma print statements inserted between items.
    candidates = [' <' + translate(w) + '>: {num} | '.format(num=c)
                  for w, c in guys]
    print(' '.join(candidates))
    print('')
    print('')

    chars = Counter(newcorpus)
    bigums = chars.most_common(20)
    # The original looped over ``guys`` again here and never used
    # ``bigums``; the most frequent characters are what this second line
    # is meant to show.
    frequent = [' [' + translate(w) + ']: {num} | '.format(num=c)
                for w, c in bigums]
    print(' '.join(frequent))
    print('')
    print('')
def compute_seqs():
    """Count the candidate sequences in the current ``newcorpus``.

    Two kinds of candidate are counted together:

    * every whitespace-separated word longer than one character, with a
      trailing space appended, counted at twice its actual frequency;
    * every pair of adjacent characters (whitespace included).

    Returns:
        A ``Counter`` mapping each candidate string to its combined count.
        An empty ``newcorpus`` yields an empty counter.
    """
    def adjacent_pairs(text):
        # Yield each two-character window of ``text``, left to right.
        # Unlike tee() + izip(), this works on Python 2 and 3 alike and
        # simply yields nothing for an empty or one-character text.
        previous = None
        for current in text:
            if previous is not None:
                yield previous + current
            previous = current

    words = [w + ' ' for w in newcorpus.split() if len(w) > 1]
    wordcounter = Counter(words)
    # Adding the counter to itself doubles every word count, so whole
    # words outrank character pairs of the same raw frequency.
    wordcounter = wordcounter + wordcounter

    paircounter = Counter(adjacent_pairs(newcorpus))
    return paircounter + wordcounter
def addguys(guys):
    """Add some guys to the global translation dictionary.

    Each sequence in ``guys`` is mapped to the next unused character from
    the shared ``characters`` generator. When the generator runs out,
    "All Done!" is printed and the remaining sequences are skipped.

    Args:
        guys: iterable of ``(sequence, count)`` pairs; only the sequence
            is used.
    """
    # ``rep`` is only mutated, never rebound, so no ``global`` is needed.
    for sequence, _count in guys:
        try:
            symbol = next(characters)
        except StopIteration:
            # Single-argument print(...) is valid on Python 2 and 3.
            print("All Done!")
            break
        rep[sequence] = symbol
def text_replace():
    """Re-encode the working corpus with the current dictionary.

    Rebinds the module-level ``newcorpus`` to the result of applying every
    entry of ``rep`` to it, in insertion order.
    """
    global newcorpus
    # encode() performs exactly this replacement pass; delegating to it
    # keeps the two from drifting apart.
    newcorpus = encode(newcorpus)
def encode(text):
    """Encode a string with the dictionary.

    Every occurrence of each key of ``rep`` in ``text`` is replaced by
    that key's symbol. Entries are applied in insertion order, so symbols
    produced by earlier entries can be matched by later ones.

    Args:
        text: the string to encode.

    Returns:
        The encoded string.
    """
    # items() rather than iteritems(): valid on both Python 2 and 3.
    for f, t in rep.items():
        text = text.replace(f, t)
    return text
def timestep():
    """Run one round of dictionary growth.

    In order: take the 100 most common candidate sequences from
    compute_seqs(), display them, add them to the translation dictionary,
    and re-encode the corpus with the enlarged dictionary.
    """
    ranking = compute_seqs()
    candidates = ranking.most_common(100)
    display(candidates)
    addguys(candidates)
    text_replace()
if __name__ == '__main__':
    # Do 194 iterations. Each one consumes up to 100 replacement characters
    # on top of the (up to) 1500 used for the initial words, i.e. at most
    # 20900 of the 20931 characters available.
    # range() and single-argument print(...) work on Python 2 and 3 alike.
    for i in range(194):
        timestep()
        print('')
        print(i)
        print('')
@alexalemi
Copy link
Author

This is most of the code used to create the Twitter compression demo shown here; the story of its construction is detailed here.

You of course need to supply a corpus text file.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment