Skip to content

Instantly share code, notes, and snippets.

@samuel
Last active November 12, 2015 20:25
Show Gist options
  • Save samuel/07e4c85611d14125951c to your computer and use it in GitHub Desktop.
Save samuel/07e4c85611d14125951c to your computer and use it in GitHub Desktop.
Facebook's Miny Compression
package main
import (
"fmt"
"log"
"regexp"
"sort"
"strconv"
"strings"
)
const (
magic = "Miny1"
dictionary = "wxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-_"
)
var dictMap map[rune]struct{}
func init() {
dictMap = make(map[rune]struct{})
for _, d := range dictionary {
dictMap[d] = struct{}{}
}
}
var reWordsAndSpaces = regexp.MustCompile(`\w+|\W+`)
type sorter struct {
s []string
n map[string]int
}
func (s sorter) Len() int { return len(s.s) }
func (s sorter) Swap(i, j int) { s.s[i], s.s[j] = s.s[j], s.s[i] }
func (s sorter) Less(i, j int) bool { return s.n[s.s[j]] < s.n[s.s[i]] }
func encode(k string) string {
if len(k) < len(magic)+3 || strings.ContainsAny(k, `\~`) {
return k
}
// Split string into words and spaces
toks := reWordsAndSpaces.FindAllString(k, -1)
// Count occurances of each token (word or space)
counts := make(map[string]int)
for _, m := range toks {
counts[m]++
}
// Create a sorted set of tokens by descending count
uniqueSorted := make([]string, 0, len(counts))
for s := range counts {
uniqueSorted = append(uniqueSorted, s)
}
sort.Sort(sorter{s: uniqueSorted, n: counts})
mapping := make(map[string]string)
for m, om := range uniqueSorted {
p := (m - m%32) / 32
c := dictionary[m%32 : m%32+1]
if p != 0 {
mapping[om] = strconv.FormatInt(int64(p), 32) + c
} else {
mapping[om] = c
}
}
// Map tokens
mappedToks := make([]string, len(toks))
for i, m := range toks {
mappedToks[i] = mapping[m]
}
uniqueSorted = append([]string{magic, strconv.Itoa(len(uniqueSorted))}, uniqueSorted...)
uniqueSorted = append(uniqueSorted, strings.Join(mappedToks, ""))
return strings.Join(uniqueSorted, "~")
}
func decode(k string) (string, error) {
o := strings.Split(k, "~")
if len(o) < 3 || o[0] != magic {
return "", fmt.Errorf("bad magic")
}
ln, err := strconv.Atoi(o[1])
if err != nil || ln != len(o)-3 {
return "", fmt.Errorf("bad length")
}
mappedToks := o[len(o)-1]
uniqueSorted := o[2 : len(o)-1]
mapping := make(map[string]string)
for m, om := range uniqueSorted {
p := (m - m%32) / 32
c := dictionary[m%32 : m%32+1]
if p != 0 {
mapping[strconv.FormatInt(int64(p), 32)+c] = om
} else {
mapping[c] = om
}
}
toks := make([]string, 0, len(mappedToks))
start := 0
for i, c := range mappedToks {
if _, ok := dictMap[c]; ok {
toks = append(toks, mapping[mappedToks[start:i+1]])
start = i + 1
}
}
return strings.Join(toks, ""), nil
}
func main() {
// s, err := decode(`...`)
// if err != nil {
// log.Fatal(err)
// }
// println(s)
s := `Test this this test 1. 2... 3..... . ... ... .. .`
x := encode(s)
println(x)
y, err := decode(x)
if err != nil {
log.Fatal(err)
}
println(y)
if s != y {
println("FAIL")
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment