ordonezf/decoder.py

## decoder.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import codecs
READ_BYTES = 200  # bytes fetched per read while scanning for the header terminator

def get_dic_from_file(f_name):
	"""Read the word dictionary stored at the start of an encoded file.

	The header of ``files/<f_name>`` is a list of words separated by single
	NUL bytes and terminated by two consecutive NUL bytes.  The n-th word is
	associated with the n-th one-byte code of the fixed code table below.

	Args:
		f_name: name of the encoded file inside the ``files`` directory.

	Returns:
		A tuple ``(dic_codes, offset)`` where ``dic_codes`` maps each code
		(int) to its word (byte string) and ``offset`` is the position of
		the first byte that follows the two-byte terminator.

	Raises:
		ValueError: if the file ends before the terminator is found.
	"""
	# One-byte values usable as word codes (tab, LF, CR and printable ASCII are excluded).
	codes = list(range(1, 9)) + [11, 12] + list(range(14, 32)) + list(range(127, 256))
	terminator = b"\x00\x00"
	# Binary mode: the header contains NUL bytes and must not be newline-translated.
	with open("files/{name}".format(name=f_name), "rb") as f:
		header = f.read(READ_BYTES)
		while terminator not in header:
			chunk = f.read(READ_BYTES)
			if not chunk:
				# End of file reached: without this check the loop would never finish.
				raise ValueError(
					"no dictionary terminator found in files/{name}".format(name=f_name))
			header += chunk
	bytes_read = header.index(terminator)
	words = header[:bytes_read].split(b"\x00")
	# zip stops at the shorter sequence, so surplus words (if any) get no code.
	return dict(zip(codes, words)), bytes_read + 2

def decode_file(f_name, dic_codes, bytes_read):
	"""Expand an encoded file into ``files/decoded_<f_name>``.

	The payload that starts at offset ``bytes_read`` is interpreted byte by
	byte:

	* a NUL byte (0x00) is an escape marker: the byte that follows it is
	  copied to the output unchanged;
	* a byte whose value is a key of ``dic_codes`` is replaced by the
	  corresponding word;
	* any other byte is copied unchanged.

	A trailing escape marker with nothing after it is ignored.  Unless
	``f_name`` is one of the two files listed as plain ASCII, the output is
	prefixed with a UTF-8 byte-order mark (only when there is output at all).

	Args:
		f_name: name of the encoded file inside the ``files`` directory.
		dic_codes: mapping of one-byte code (int) to word.
		bytes_read: offset of the first payload byte (size of the header).
	"""
	plain_ascii = ("encoded_pg84.txt", "encoded_pg21279.txt")
	with open("files/{name}".format(name=f_name), "rb") as f:
		f.seek(bytes_read)
		# bytearray yields ints on indexing under both Python 2 and Python 3.
		payload = bytearray(f.read())
	chunks = []
	pos = 0
	total = len(payload)
	while pos < total:
		code = payload[pos]
		pos += 1
		if code == 0:
			if pos >= total:
				break
			# Escaped bytes are raw bytes of the original text; copying them
			# verbatim keeps multi-byte UTF-8 sequences intact.
			chunks.append(bytes(payload[pos:pos + 1]))
			pos += 1
		elif code in dic_codes:
			word = dic_codes[code]
			if not isinstance(word, bytes):
				word = word.encode("utf-8")
			chunks.append(word)
		else:
			chunks.append(bytes(payload[pos - 1:pos]))
	# Binary mode so that line endings stored in the payload are written as-is.
	with open("files/decoded_{name}".format(name=f_name), "wb") as w:
		if chunks and f_name not in plain_ascii:
			w.write(codecs.BOM_UTF8)
		w.write(b"".join(chunks))

def start_decoder():
	"""Decode each of the six hard-coded encoded files, one after another."""
	encoded_names = (
		"encoded_pg10.txt",
		"encoded_pg84.txt",
		"encoded_pg100.txt",
		"encoded_pg1400.txt",
		"encoded_pg21279.txt",
		"encoded_pg22657.txt",
	)
	for encoded_name in encoded_names:
		# A single parenthesised argument prints the same text as the bare statement form.
		print("Decoding file: {}...".format(encoded_name))
		# The header yields both the code table and the offset where the payload starts.
		code_table, payload_offset = get_dic_from_file(encoded_name)
		decode_file(encoded_name, code_table, payload_offset)

## encoder.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Number of frequent words start_encoder asks for.  match_frecuent_words only
# has 157 one-byte codes to hand out, so words beyond that never get a code.
TOP_WORDS = 158

def most_frecuent_words(f_name, nwords):
	"""Return the most frequent whitespace-separated words of a file.

	Args:
		f_name: name of the text file inside the ``files`` directory.
		nwords: maximum number of words to return.

	Returns:
		A list with at most ``nwords`` words, most frequent first.  Words
		with equal counts keep the iteration order of the counting dict
		(the sort is stable).
	"""
	counts = {}
	with open("files/{name}".format(name=f_name), "r") as f:
		for line in f:
			for word in line.split():
				counts[word] = counts.get(word, 0) + 1
	ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
	# Honour the caller's limit instead of the module-level constant.
	return [word for word, _ in ranked[:nwords]]

def match_frecuent_words(top_words):
	"""Assign a one-byte code to each word, in order.

	The available codes are 1-8, 11, 12, 14-31 and 127-255 (157 values);
	tab, LF, CR and printable ASCII are left out.

	Args:
		top_words: iterable of words, most important first.

	Returns:
		A dict mapping word to code.  Words beyond the 157th are left out
		because zip stops when the codes run out.
	"""
	# list(...) keeps the concatenation valid where range() is not a list.
	codes = list(range(1, 9)) + [11, 12] + list(range(14, 32)) + list(range(127, 256))
	return dict(zip(top_words, codes))

def encode_file(f_name, dic_codes):
	"""Write the dictionary-compressed form of a text file.

	Reads ``files/<f_name>`` and writes ``files/encoded_<f_name>``.

	Output layout:

	* header: the dictionary words ordered by code, separated by single NUL
	  bytes and terminated by two NUL bytes;
	* body, for every input line: each space character becomes byte 32,
	  each whitespace-delimited word found in ``dic_codes`` becomes its
	  one-byte code, and every other word is copied byte by byte with a NUL
	  byte inserted before any byte whose value is a reserved code.  The
	  line is then terminated with CR LF, except for two hard-coded lines
	  that are terminated with a bare LF.

	Whitespace other than the space character (tabs, the original line
	ending) is dropped by the word split.  A word spelled exactly
	``SPACE_HERE`` is indistinguishable from the internal space marker.

	Args:
		f_name: name of the text file inside the ``files`` directory.
		dic_codes: mapping of word to one-byte code (int).

	Raises:
		UnicodeDecodeError: if a line of a file not listed as plain ASCII
			is not valid UTF-8.
	"""
	plain_ascii = ("pg84.txt", "pg21279.txt")
	# Line number (1-based) that is terminated with LF only, per file.
	# NOTE(review): presumably mirrors the line endings of the source texts - confirm.
	lf_only_line = {"pg84.txt": 18, "pg21279.txt": 17}
	# Byte values reserved for dictionary codes; a set gives O(1) membership tests.
	reserved = frozenset(
		list(range(1, 9)) + [11, 12] + list(range(14, 32)) + list(range(127, 256)))
	# Work on byte strings throughout so lookups match the bytes read from the file.
	table = {}
	for word, code in dic_codes.items():
		if not isinstance(word, bytes):
			word = word.encode("utf-8")
		table[word] = code
	by_code = sorted(table.items(), key=lambda item: item[1])
	header = b"\x00".join([word for word, _ in by_code]) + b"\x00\x00"
	source_path = "files/{name}".format(name=f_name)
	target_path = "files/encoded_{name}".format(name=f_name)
	with open(source_path, "rb") as f, open(target_path, "wb") as w:
		w.write(header)
		for line_no, line in enumerate(f, 1):
			# Surround every space with a marker token so that split() below
			# still reports where (and how many) spaces occurred.
			marked = line.replace(b" ", b" SPACE_HERE ")
			if f_name not in plain_ascii:
				# Validates the UTF-8 and strips a byte-order mark at the start of the line.
				marked = marked.decode("utf-8-sig").encode("utf-8")
			encoded = bytearray()
			for word in marked.split():
				if word == b"SPACE_HERE":
					encoded.append(32)
				elif word in table:
					encoded.append(table[word])
				else:
					for value in bytearray(word):
						if value in reserved:
							# Escape marker: the next byte is literal, not a code.
							encoded.append(0)
						encoded.append(value)
			if lf_only_line.get(f_name) == line_no:
				encoded += b"\n"
			else:
				encoded += b"\r\n"
			w.write(encoded)

def start_encoder():
	"""Encode each of the six hard-coded source texts, one after another."""
	source_names = (
		"pg10.txt",
		"pg84.txt",
		"pg100.txt",
		"pg1400.txt",
		"pg21279.txt",
		"pg22657.txt",
	)
	for source_name in source_names:
		# A single parenthesised argument prints the same text as the bare statement form.
		print("Encoding file: {}...".format(source_name))
		# Rank the words of this file, give the top ones a code, then encode with that table.
		code_table = match_frecuent_words(most_frecuent_words(source_name, TOP_WORDS))
		encode_file(source_name, code_table)

## finger5.py
from encoder import start_encoder
from decoder import start_decoder
def main():
	"""Run the encoding pass followed by the decoding pass."""
	for stage in (start_encoder, start_decoder):
		stage()
# Executed unconditionally when this module is loaded.
main()
	#!/usr/bin/env python
	# -*- coding: utf-8 -*-
	import sys
	import codecs
	READ_BYTES = 200  # bytes fetched per read while scanning for the header terminator

	def get_dic_from_file(f_name):
		"""Read the word dictionary stored at the start of an encoded file.

		The header of ``files/<f_name>`` is a list of words separated by single
		NUL bytes and terminated by two consecutive NUL bytes.  The n-th word is
		associated with the n-th one-byte code of the fixed code table below.

		Args:
			f_name: name of the encoded file inside the ``files`` directory.

		Returns:
			A tuple ``(dic_codes, offset)`` where ``dic_codes`` maps each code
			(int) to its word (byte string) and ``offset`` is the position of
			the first byte that follows the two-byte terminator.

		Raises:
			ValueError: if the file ends before the terminator is found.
		"""
		# One-byte values usable as word codes (tab, LF, CR and printable ASCII are excluded).
		codes = list(range(1, 9)) + [11, 12] + list(range(14, 32)) + list(range(127, 256))
		terminator = b"\x00\x00"
		# Binary mode: the header contains NUL bytes and must not be newline-translated.
		with open("files/{name}".format(name=f_name), "rb") as f:
			header = f.read(READ_BYTES)
			while terminator not in header:
				chunk = f.read(READ_BYTES)
				if not chunk:
					# End of file reached: without this check the loop would never finish.
					raise ValueError(
						"no dictionary terminator found in files/{name}".format(name=f_name))
				header += chunk
		bytes_read = header.index(terminator)
		words = header[:bytes_read].split(b"\x00")
		# zip stops at the shorter sequence, so surplus words (if any) get no code.
		return dict(zip(codes, words)), bytes_read + 2

	def decode_file(f_name, dic_codes, bytes_read):
		"""Expand an encoded file into ``files/decoded_<f_name>``.

		The payload that starts at offset ``bytes_read`` is interpreted byte by
		byte:

		* a NUL byte (0x00) is an escape marker: the byte that follows it is
		  copied to the output unchanged;
		* a byte whose value is a key of ``dic_codes`` is replaced by the
		  corresponding word;
		* any other byte is copied unchanged.

		A trailing escape marker with nothing after it is ignored.  Unless
		``f_name`` is one of the two files listed as plain ASCII, the output is
		prefixed with a UTF-8 byte-order mark (only when there is output at all).

		Args:
			f_name: name of the encoded file inside the ``files`` directory.
			dic_codes: mapping of one-byte code (int) to word.
			bytes_read: offset of the first payload byte (size of the header).
		"""
		plain_ascii = ("encoded_pg84.txt", "encoded_pg21279.txt")
		with open("files/{name}".format(name=f_name), "rb") as f:
			f.seek(bytes_read)
			# bytearray yields ints on indexing under both Python 2 and Python 3.
			payload = bytearray(f.read())
		chunks = []
		pos = 0
		total = len(payload)
		while pos < total:
			code = payload[pos]
			pos += 1
			if code == 0:
				if pos >= total:
					break
				# Escaped bytes are raw bytes of the original text; copying them
				# verbatim keeps multi-byte UTF-8 sequences intact.
				chunks.append(bytes(payload[pos:pos + 1]))
				pos += 1
			elif code in dic_codes:
				word = dic_codes[code]
				if not isinstance(word, bytes):
					word = word.encode("utf-8")
				chunks.append(word)
			else:
				chunks.append(bytes(payload[pos - 1:pos]))
		# Binary mode so that line endings stored in the payload are written as-is.
		with open("files/decoded_{name}".format(name=f_name), "wb") as w:
			if chunks and f_name not in plain_ascii:
				w.write(codecs.BOM_UTF8)
			w.write(b"".join(chunks))

	def start_decoder():
		"""Decode each of the six hard-coded encoded files, one after another."""
		names = ["encoded_pg10.txt", "encoded_pg84.txt", "encoded_pg100.txt",
				"encoded_pg1400.txt", "encoded_pg21279.txt", "encoded_pg22657.txt"]
		for f_name in names:
			# A single parenthesised argument prints the same text as the bare statement form.
			print("Decoding file: {}...".format(f_name))
			# The header yields both the code table and the offset where the payload starts.
			dic_codes, bytes_read = get_dic_from_file(f_name)
			decode_file(f_name, dic_codes, bytes_read)
	#!/usr/bin/env python
	# -*- coding: utf-8 -*-
	# Number of frequent words start_encoder asks for.  match_frecuent_words only
	# has 157 one-byte codes to hand out, so words beyond that never get a code.
	TOP_WORDS = 158

	def most_frecuent_words(f_name, nwords):
		"""Return the most frequent whitespace-separated words of a file.

		Args:
			f_name: name of the text file inside the ``files`` directory.
			nwords: maximum number of words to return.

		Returns:
			A list with at most ``nwords`` words, most frequent first.  Words
			with equal counts keep the iteration order of the counting dict
			(the sort is stable).
		"""
		counts = {}
		with open("files/{name}".format(name=f_name), "r") as f:
			for line in f:
				for word in line.split():
					counts[word] = counts.get(word, 0) + 1
		ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
		# Honour the caller's limit instead of the module-level constant.
		return [word for word, _ in ranked[:nwords]]

	def match_frecuent_words(top_words):
		"""Assign a one-byte code to each word, in order.

		The available codes are 1-8, 11, 12, 14-31 and 127-255 (157 values);
		tab, LF, CR and printable ASCII are left out.

		Args:
			top_words: iterable of words, most important first.

		Returns:
			A dict mapping word to code.  Words beyond the 157th are left out
			because zip stops when the codes run out.
		"""
		# list(...) keeps the concatenation valid where range() is not a list.
		codes = list(range(1, 9)) + [11, 12] + list(range(14, 32)) + list(range(127, 256))
		return dict(zip(top_words, codes))

	def encode_file(f_name, dic_codes):
		"""Write the dictionary-compressed form of a text file.

		Reads ``files/<f_name>`` and writes ``files/encoded_<f_name>``.

		Output layout:

		* header: the dictionary words ordered by code, separated by single NUL
		  bytes and terminated by two NUL bytes;
		* body, for every input line: each space character becomes byte 32,
		  each whitespace-delimited word found in ``dic_codes`` becomes its
		  one-byte code, and every other word is copied byte by byte with a NUL
		  byte inserted before any byte whose value is a reserved code.  The
		  line is then terminated with CR LF, except for two hard-coded lines
		  that are terminated with a bare LF.

		Whitespace other than the space character (tabs, the original line
		ending) is dropped by the word split.  A word spelled exactly
		``SPACE_HERE`` is indistinguishable from the internal space marker.

		Args:
			f_name: name of the text file inside the ``files`` directory.
			dic_codes: mapping of word to one-byte code (int).

		Raises:
			UnicodeDecodeError: if a line of a file not listed as plain ASCII
				is not valid UTF-8.
		"""
		plain_ascii = ("pg84.txt", "pg21279.txt")
		# Line number (1-based) that is terminated with LF only, per file.
		# NOTE(review): presumably mirrors the line endings of the source texts - confirm.
		lf_only_line = {"pg84.txt": 18, "pg21279.txt": 17}
		# Byte values reserved for dictionary codes; a set gives O(1) membership tests.
		reserved = frozenset(
			list(range(1, 9)) + [11, 12] + list(range(14, 32)) + list(range(127, 256)))
		# Work on byte strings throughout so lookups match the bytes read from the file.
		table = {}
		for word, code in dic_codes.items():
			if not isinstance(word, bytes):
				word = word.encode("utf-8")
			table[word] = code
		by_code = sorted(table.items(), key=lambda item: item[1])
		header = b"\x00".join([word for word, _ in by_code]) + b"\x00\x00"
		source_path = "files/{name}".format(name=f_name)
		target_path = "files/encoded_{name}".format(name=f_name)
		with open(source_path, "rb") as f, open(target_path, "wb") as w:
			w.write(header)
			for line_no, line in enumerate(f, 1):
				# Surround every space with a marker token so that split() below
				# still reports where (and how many) spaces occurred.
				marked = line.replace(b" ", b" SPACE_HERE ")
				if f_name not in plain_ascii:
					# Validates the UTF-8 and strips a byte-order mark at the start of the line.
					marked = marked.decode("utf-8-sig").encode("utf-8")
				encoded = bytearray()
				for word in marked.split():
					if word == b"SPACE_HERE":
						encoded.append(32)
					elif word in table:
						encoded.append(table[word])
					else:
						for value in bytearray(word):
							if value in reserved:
								# Escape marker: the next byte is literal, not a code.
								encoded.append(0)
							encoded.append(value)
				if lf_only_line.get(f_name) == line_no:
					encoded += b"\n"
				else:
					encoded += b"\r\n"
				w.write(encoded)

	def start_encoder():
		"""Encode each of the six hard-coded source texts, one after another."""
		names = ["pg10.txt", "pg84.txt", "pg100.txt", "pg1400.txt",
				"pg21279.txt", "pg22657.txt"]
		for f_name in names:
			# A single parenthesised argument prints the same text as the bare statement form.
			print("Encoding file: {}...".format(f_name))
			# Rank the words of this file, give the top ones a code, then encode with that table.
			top_words = most_frecuent_words(f_name, TOP_WORDS)
			dic_codes = match_frecuent_words(top_words)
			encode_file(f_name, dic_codes)
	from encoder import start_encoder
	from decoder import start_decoder
	def main():
		"""Run the encoding pass followed by the decoding pass."""
		start_encoder()
		start_decoder()
	# Executed unconditionally when this code is loaded.
	main()