Skip to content

Instantly share code, notes, and snippets.

@braska
Created November 13, 2017 00:28
Show Gist options
  • Save braska/fc45a300adc2466fae32c4b256f008a8 to your computer and use it in GitHub Desktop.
Save braska/fc45a300adc2466fae32c4b256f008a8 to your computer and use it in GitHub Desktop.
Huffman text encoder/decoder
package main
import (
"flag"
"fmt"
"os"
"bufio"
"math"
"bytes"
"encoding/gob"
"github.com/olekukonko/tablewriter"
"github.com/dustin/go-humanize"
)
func usage() {
fmt.Fprintf(os.Stderr, "usage: %s [command] [inputfile] [outputfile]\n", os.Args[0])
flag.PrintDefaults()
os.Exit(2)
}
func check(e error) {
if e != nil {
panic(e)
}
}
type EncodingMode int
const (
MODE_ONE EncodingMode = iota
MODE_TWO EncodingMode = iota
)
type huffmanCode struct {
Code uint32
CodeLen uint8
}
func (hcode huffmanCode) asString() string {
var buffer bytes.Buffer
for i := hcode.CodeLen; i > 0; i-- {
n := hcode.Code & (1 << (i - 1))
var t string
if n > 0 {
t = "1"
} else {
t = "0"
}
buffer.WriteString(t)
}
return buffer.String()
}
type huffmanCodes map[string]*huffmanCode
func findMinKeyInMap(m map[string]int) (k string) {
var minValue int
isFirstIter := true
for key, value := range m {
if isFirstIter {
minValue = value
k = key
isFirstIter = false
}
if value < minValue {
minValue = value
k = key
}
}
return
}
func encode(mode EncodingMode, inputf *os.File, outputf *os.File) {
table := tablewriter.NewWriter(os.Stdout)
countRunes := 0
vertices := make(map[string]int)
combinations := make(map[string]int)
var prevRune string
scanner := bufio.NewScanner(inputf)
scanner.Split(bufio.ScanRunes)
for scanner.Scan() {
s := scanner.Text()
vertices[s] += 1
countRunes += 1
if prevRune != "" {
combination := prevRune + s
combinations[combination] += 1
}
prevRune = s
}
countCombinations := countRunes - 1
table.Append([]string{"Runes", fmt.Sprint(countRunes)})
table.Append([]string{"Alphabet power", fmt.Sprint(len(vertices))})
entropy := 0.0
for _, value := range vertices {
p_i := float64(value) / float64(countRunes)
entropy -= p_i * math.Log2(p_i)
}
table.Append([]string{"Entropy", fmt.Sprint(entropy)})
conditionalEntropy := 0.0
for s1, count1 := range vertices {
p_i := float64(count1) / float64(countRunes)
for s2 := range vertices {
countForCombination, ok := combinations[s1 + s2]
if ok {
p_i_j := float64(countForCombination) / float64(countCombinations)
conditionalEntropy -= p_i_j * math.Log2(p_i_j / p_i)
}
}
}
table.Append([]string{"Conditional entropy", fmt.Sprint(conditionalEntropy)})
tableOfCodes := tablewriter.NewWriter(os.Stdout)
codes := make(huffmanCodes)
countBitsForAllText := 0
var huffmanTreeVertices map[string]int
if mode == MODE_ONE {
huffmanTreeVertices = vertices
} else {
huffmanTreeVertices = combinations
}
for len(huffmanTreeVertices) > 1 {
key1 := findMinKeyInMap(huffmanTreeVertices)
value1 := huffmanTreeVertices[key1]
delete(huffmanTreeVertices, key1)
key2 := findMinKeyInMap(huffmanTreeVertices)
value2 := huffmanTreeVertices[key2]
delete(huffmanTreeVertices, key2)
// if this is just leaf in huffman tree (single rune)
if ((mode == MODE_ONE && len([]rune(key1)) == 1) || (mode == MODE_TWO && len([]rune(key1)) == 2)) {
// key1 is a string from left branch => code=0
codes[key1] = &huffmanCode{Code: 0, CodeLen: 1}
countBitsForAllText += value1
} else {
if mode == MODE_ONE {
// for all runes from key1 add ZERO as a prefix for code
// adding zero as a prefix is the same as increment codeLen
for _, r := range key1 {
codes[string(r)].CodeLen += 1
}
} else {
var prevSym string
for _, r := range key1 {
if prevSym != "" {
combination := prevSym + string(r)
codes[combination].CodeLen += 1
prevSym = ""
} else {
prevSym = string(r)
}
}
}
}
if ((mode == MODE_ONE && len([]rune(key2)) == 1) || (mode == MODE_TWO && len([]rune(key2)) == 2)) {
codes[key2] = &huffmanCode{Code: 1, CodeLen: 1}
countBitsForAllText += value2
} else {
if mode == MODE_ONE {
for _, r := range key2 {
k := string(r)
// set bit to 1 at specific position
codes[k].Code |= (1 << codes[k].CodeLen)
codes[k].CodeLen += 1
}
} else {
var prevSym string
for _, r := range key2 {
if prevSym != "" {
combination := prevSym + string(r)
// set bit to 1 at specific position
codes[combination].Code |= (1 << codes[combination].CodeLen)
codes[combination].CodeLen += 1
prevSym = ""
} else {
prevSym = string(r)
}
}
}
}
huffmanTreeVertices[key1 + key2] = value1 + value2
}
enc := gob.NewEncoder(outputf)
err := enc.Encode(codes)
check(err)
err = enc.Encode(countBitsForAllText)
check(err)
for r, code := range codes {
tableOfCodes.Append([]string{string(r), fmt.Sprint(code.asString())})
}
inputf.Seek(0, 0)
scanner2 := bufio.NewScanner(inputf)
scanner2.Split(bufio.ScanRunes)
var tmpByte byte
var bitsInTmpByte uint8 = 0
var prevSym string
for scanner2.Scan() {
s := scanner2.Text()
if mode == MODE_TWO {
if prevSym == "" {
prevSym = s
continue
}
s = prevSym + s
prevSym = ""
}
hcode := *codes[s]
for hcode.CodeLen > 0 {
shift := int(hcode.CodeLen) - (8 - int(bitsInTmpByte))
var bitsMustBeWritenToTmpByte uint8 = 0
if shift >= 0 {
tmpByte += byte((hcode.Code >> uint(shift)) & 0xff)
bitsMustBeWritenToTmpByte = 8 - bitsInTmpByte
bitsInTmpByte = 8
} else if shift < 0 {
tmpByte += byte((hcode.Code << uint(shift * -1)) & 0xff)
bitsMustBeWritenToTmpByte = hcode.CodeLen
bitsInTmpByte += hcode.CodeLen
}
if shift <= 0 {
hcode.CodeLen = 0
} else {
hcode.CodeLen -= bitsMustBeWritenToTmpByte
}
if bitsInTmpByte == 8 {
_, err = outputf.Write([]byte{tmpByte})
check(err)
bitsInTmpByte = 0
tmpByte = 0
}
}
}
if bitsInTmpByte > 0 {
_, err = outputf.Write([]byte{tmpByte})
check(err)
bitsInTmpByte = 0
tmpByte = 0
}
outputf.Sync()
tableOfStats := tablewriter.NewWriter(os.Stdout)
inputfStat, istatErr := inputf.Stat()
check(istatErr)
outputfStat, ostatErr := outputf.Stat()
check(ostatErr)
inputfSize := inputfStat.Size()
outputfSize := outputfStat.Size()
tableOfStats.Append([]string{"Input file size", humanize.Bytes(uint64(inputfSize))})
tableOfStats.Append([]string{"Output file size", humanize.Bytes(uint64(outputfSize))})
tableOfStats.Append([]string{"Bits per rune in input file", fmt.Sprint(float64(inputfSize * 8) / float64(countRunes))})
tableOfStats.Append([]string{"Bits per rune in output file", fmt.Sprint(float64(outputfSize * 8) / float64(countRunes))})
table.Render()
tableOfCodes.Render()
tableOfStats.Render()
}
func decode(inputf *os.File, outputf *os.File) {
dec := gob.NewDecoder(inputf)
var codes = new(huffmanCodes)
var countBitsForAllText int
err := dec.Decode(codes)
check(err)
err = dec.Decode(&countBitsForAllText)
check(err)
fmt.Println(codes)
fmt.Println(countBitsForAllText)
}
func main() {
flag.Usage = usage
flag.Parse()
args := flag.Args()
if len(args) < 1 {
fmt.Println("Command is missing.")
os.Exit(1)
}
if len(args) < 2 {
fmt.Println("Input file is missing.")
os.Exit(1)
}
if len(args) < 3 {
fmt.Println("Output file is missing.")
os.Exit(1)
}
inputPath := os.Args[2]
outputPath := os.Args[3]
inputf, inputerr := os.Open(inputPath)
check(inputerr)
outputf, outputerr := os.Create(outputPath)
check(outputerr)
defer inputf.Close()
defer outputf.Close()
if os.Args[1] == "encode" {
encode(MODE_ONE, inputf, outputf)
} else if os.Args[1] == "encode2" {
encode(MODE_TWO, inputf, outputf)
} else if os.Args[1] == "decode" {
decode(inputf, outputf)
} else {
fmt.Fprintf(os.Stderr, "%s: '%s' is not a %s command.\n", os.Args[0], os.Args[1], os.Args[0])
os.Exit(1)
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment