Skip to content

Instantly share code, notes, and snippets.

@ordonezf
Last active April 4, 2016 23:41
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ordonezf/f653aea2f3242704ee5d0bba1f682c29 to your computer and use it in GitHub Desktop.
Save ordonezf/f653aea2f3242704ee5d0bba1f682c29 to your computer and use it in GitHub Desktop.
Finger 5
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import codecs
# Chunk size, in bytes, that get_dic_from_file reads at a time while it
# scans the start of an encoded file for the end of the word dictionary.
READ_BYTES = 200
def get_dic_from_file(f_name):
    """Read the word dictionary stored at the start of an encoded file.

    The file ``files/<f_name>`` is expected to begin with the dictionary
    words separated by single NUL bytes and terminated by two consecutive
    NUL bytes. The words are paired, in order, with the reserved byte codes.

    Args:
        f_name: Name of the encoded file inside the ``files`` directory.

    Returns:
        A ``(dic_codes, offset)`` tuple. ``dic_codes`` maps each byte code
        to its word; ``offset`` is the position of the first byte after the
        two-NUL terminator, i.e. where the encoded text starts.

    Raises:
        ValueError: If the file ends before the terminator is found.
    """
    codes = (
        list(range(1, 9)) + [11, 12] + list(range(14, 32))
        + list(range(127, 256))
    )
    terminator = b"\x00\x00"
    header = b""
    # Binary mode so the returned offset matches the "rb" handle that
    # decode_file seeks with (text mode may translate bytes on some OSes).
    with open("files/{name}".format(name=f_name), "rb") as f:
        while terminator not in header:
            chunk = f.read(READ_BYTES)
            if not chunk:
                # Without this guard a truncated file would spin forever,
                # because read() keeps returning an empty string at EOF.
                raise ValueError(
                    "dictionary terminator not found in {name}".format(
                        name=f_name))
            header += chunk
    bytes_read = header.index(terminator)
    words = header[:bytes_read].split(b"\x00")
    # zip() stops at the shorter sequence, so surplus words or codes are
    # ignored.
    dic_codes = dict(zip(codes, words))
    # +2 skips the two terminator bytes.
    return dic_codes, bytes_read + 2
def decode_file(f_name, dic_codes, bytes_read):
    """Decode ``files/<f_name>`` into ``files/decoded_<f_name>``.

    Starting at offset ``bytes_read`` the input is processed one byte at a
    time:

    * a NUL byte is an escape marker: the byte that follows is written
      literally (as the character with that code point);
    * a byte that is a key of ``dic_codes`` is replaced by its word;
    * any other byte is written unchanged.

    Files listed in ``ascii_files`` are written with a plain file object;
    every other file is written through a ``utf-8-sig`` codec writer.

    NOTE: this relies on Python 2 string semantics (``unichr``, implicit
    str/unicode conversion on write).

    Args:
        f_name: Name of the encoded file inside the ``files`` directory.
        dic_codes: Mapping from byte code (int) to dictionary word.
        bytes_read: Offset of the first encoded byte, right after the
            dictionary header.
    """
    ascii_files = ["encoded_pg84.txt", "encoded_pg21279.txt"]
    src_path = "files/{name}".format(name=f_name)
    dst_path = "files/decoded_{name}".format(name=f_name)

    with open(src_path, "rb") as f:
        if f_name not in ascii_files:
            w = codecs.open(dst_path, "w", "utf-8-sig")
        else:
            w = open(dst_path, "w")
        try:
            f.seek(bytes_read)
            while True:
                byte = f.read(1)
                if not byte:
                    break
                if ord(byte) == 0:
                    # Escape marker: emit the next byte verbatim.
                    byte = f.read(1)
                    if not byte:
                        break
                    char = unichr(ord(byte))
                    try:
                        w.write(char)
                    except UnicodeError:
                        # The plain writer cannot take this character;
                        # fall back to a lossy encoding. stdout's encoding
                        # may be None (e.g. when piped), hence the default.
                        fallback = sys.stdout.encoding or "ascii"
                        w.write(char.encode(fallback, errors="replace"))
                elif ord(byte) in dic_codes:
                    word = dic_codes[ord(byte)]
                    try:
                        w.write(str(word))
                    except UnicodeError:
                        # The codec writer rejected the raw bytes; keep a
                        # printable representation instead of failing.
                        w.write(repr(word))
                else:
                    w.write(chr(ord(byte)))
        finally:
            # Always release the output handle, even if decoding fails.
            w.close()
def start_decoder():
    """Decode every known encoded file found in the ``files`` directory.

    For each file the dictionary header is read first and the remainder of
    the file is then decoded with it.
    """
    encoded_names = (
        "encoded_pg10.txt",
        "encoded_pg84.txt",
        "encoded_pg100.txt",
        "encoded_pg1400.txt",
        "encoded_pg21279.txt",
        "encoded_pg22657.txt",
    )
    for f_name in encoded_names:
        print("Decoding file: {}...".format(f_name))
        header = get_dic_from_file(f_name)
        decode_file(f_name, header[0], header[1])
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Number of most frequent words that start_encoder asks for when building
# the dictionary of each file.
TOP_WORDS = 158
def most_frecuent_words(f_name, nwords):
    """Return the most frequent whitespace-separated words of a file.

    Args:
        f_name: Name of the text file inside the ``files`` directory.
        nwords: Maximum number of words to return. Values below zero are
            treated as zero.

    Returns:
        A list with at most ``nwords`` words, ordered from most to least
        frequent. Words with equal counts keep the order in which the
        counting dictionary yields them.
    """
    counts = {}
    with open("files/{name}".format(name=f_name), "r") as f:
        for line in f:
            for word in line.split():
                counts[word] = counts.get(word, 0) + 1
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    # Honour the caller's limit instead of a module-level constant.
    limit = max(nwords, 0)
    return [word for word, _ in ranked[:limit]]
def match_frecuent_words(top_words):
    """Assign one reserved byte code to each of the given words.

    The available codes are 1-8, 11, 12, 14-31 and 127-255 (157 values):
    byte 0, bytes 9, 10, 13 and the range 32-126 are left out. Words are
    paired with the codes in order; words beyond the number of available
    codes are ignored.

    Args:
        top_words: Iterable of words, most frequent first.

    Returns:
        A dict mapping each word to its byte code.
    """
    # list() keeps the concatenation valid when range() is not a list.
    codes = (
        list(range(1, 9)) + [11, 12] + list(range(14, 32))
        + list(range(127, 256))
    )
    return dict(zip(top_words, codes))
def encode_file(f_name, dic_codes):
    """Encode ``files/<f_name>`` into ``files/encoded_<f_name>``.

    Output layout:

    1. Header: the dictionary words, ordered by their code, joined by NUL
       bytes and terminated by two NUL bytes.
    2. Body: for every input line, each space becomes byte 32, each word
       found in ``dic_codes`` becomes its single byte code, and any other
       word is written byte by byte with a NUL escape in front of every
       byte whose value is one of the reserved codes. Each line is closed
       with CR LF, except for two hard-coded lines that get a bare LF.

    Files not listed in ``ascii_files`` have each line decoded as
    ``utf-8-sig`` and re-encoded as ``utf-8`` before being tokenised.

    NOTE: this relies on Python 2 ``str`` semantics (``str.decode``).

    Args:
        f_name: Name of the text file inside the ``files`` directory.
        dic_codes: Mapping from word to its reserved byte code.
    """
    ascii_files = ["pg84.txt", "pg21279.txt"]
    # A set gives constant-time membership tests in the per-byte loop.
    reserved = set(
        list(range(1, 9)) + [11, 12] + list(range(14, 32))
        + list(range(127, 256))
    )
    by_code = sorted(dic_codes.items(), key=lambda item: item[1])
    header = chr(0).join([word for word, _ in by_code]) + chr(0) + chr(0)

    src_path = "files/{name}".format(name=f_name)
    dst_path = "files/encoded_{name}".format(name=f_name)
    with open(src_path, "r") as f, open(dst_path, "wb") as w:
        w.write(bytearray([ord(ch) for ch in header]))
        for line_no, line in enumerate(f, 1):
            # Surround every space with a marker token so that split()
            # below still reports where (and how many) spaces occurred.
            marked = "".join(
                [" SPACE_HERE " if ch == " " else ch for ch in line])
            if f_name not in ascii_files:
                marked = marked.decode("utf-8-sig").encode("utf-8")
            for word in marked.split():
                if word == "SPACE_HERE":
                    w.write(bytearray([32]))
                elif word in dic_codes:
                    w.write(bytearray([dic_codes[word]]))
                else:
                    escaped = []
                    for value in [ord(ch) for ch in word]:
                        if value in reserved:
                            # NUL prefix marks the next byte as literal.
                            escaped.append(0)
                        escaped.append(value)
                    w.write(bytearray(escaped))
            # NOTE(review): these two lines are terminated with LF only;
            # presumably this mirrors the line endings of the source
            # texts -- TODO confirm.
            lf_only = (
                (line_no == 18 and f_name == 'pg84.txt')
                or (line_no == 17 and f_name == 'pg21279.txt')
            )
            if lf_only:
                w.write(bytearray(b'\n'))
            else:
                w.write(bytearray(b'\r\n'))
def start_encoder():
    """Encode every known text file found in the ``files`` directory.

    For each file the most frequent words are collected, mapped to byte
    codes, and the file is then encoded with that mapping.
    """
    plain_names = (
        "pg10.txt",
        "pg84.txt",
        "pg100.txt",
        "pg1400.txt",
        "pg21279.txt",
        "pg22657.txt",
    )
    for f_name in plain_names:
        print("Encoding file: {}...".format(f_name))
        word_codes = match_frecuent_words(
            most_frecuent_words(f_name, TOP_WORDS))
        encode_file(f_name, word_codes)
from encoder import start_encoder
from decoder import start_decoder
def main():
    """Run the encoder and then the decoder."""
    start_encoder()
    start_decoder()


# Executed at module level: running (or importing) this file starts the
# whole encode/decode round trip.
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment