Skip to content

Instantly share code, notes, and snippets.

@CAFxX
Last active January 28, 2020 09:04
Show Gist options
  • Save CAFxX/f945d3be0d99863d47d884feb0fa56e9 to your computer and use it in GitHub Desktop.
Save CAFxX/f945d3be0d99863d47d884feb0fa56e9 to your computer and use it in GitHub Desktop.
dict.go
package main
import (
"fmt"
//"strings"
"io/ioutil"
"sync"
"runtime"
)
const (
minLen = 4
maxLen = 48
)
func toShards(m map[string]int, s string) map[string]int {
if m == nil {
m = make(map[string]int, (maxLen-minLen)*len(s))
}
_toShards(m, s)
return m
}
func _toShards(m map[string]int, s string) {
for i := 0; i < len(s); i++ {
for j := minLen; j <= maxLen && i+j<=len(s); j++ {
// TODO: handle overlapping strings
// TODO: do not consider repetitions that happen in the same window
m[s[i:i+j]]++
}
}
}
func pickTopString(m map[string]int) string {
var topString string
var topScore int
for k := range m {
score := evalScore(m, k)
if score > topScore {
topString, topScore = k, score
}
}
return topString
}
func pickTopStringParallel(m map[string]int) string {
c := make([]string, 0, len(m))
for k := range m {
c = append(c, k)
}
var topString string
var topScore int
p := runtime.GOMAXPROCS(0)
var chunkSize = (len(c) + p - 1) / p
var mutex sync.Mutex
var wg sync.WaitGroup
for i := 0; i < p; i++ {
start, end := i*chunkSize, (i+1)*chunkSize
if end > len(c) {
end = len(c)
}
wg.Add(1)
go func() {
defer wg.Done()
var locString string
var locScore int
for _, k := range c[start:end] {
score := evalScore(m, k)
if score > locScore {
locString, locScore = k, score
}
}
mutex.Lock()
defer mutex.Unlock()
if locScore > topScore {
topString, topScore = locString, locScore
}
}()
}
wg.Wait()
return topString
}
func evalScore(m map[string]int, k string) int {
const refSize = 3
ck := m[k]
score := ck * (len(k) - refSize)
for i := 0; i < len(k); i++ {
for j := minLen; j <= maxLen && i+j<=len(k); j++ {
s := k[i:i+j]
if cs := m[s]; cs > ck {
score += (cs-ck) * (len(s) - refSize)
}
}
}
return score / len(k)
}
func removeString(m map[string]int, s string) {
for i := 0; i < len(s); i++ {
for j := minLen; j <= maxLen && i+j<=len(s); j++ {
delete(m, s[i:i+j])
}
}
}
func dropUnlikely(n map[string]int, thr int) {
for k, v := range n {
if v < thr {
delete(n, k)
}
}
}
func BuildDictionary(s []string, maxSize int) []string {
n := make(map[string]int)
for i, k := range s {
if i % 1000 == 0 {
dropUnlikely(n, i/1000)
fmt.Printf("> %d/%d %d \r", i, len(s), len(n))
}
n = toShards(n, k)
}
dropUnlikely(n, len(s)/1000)
dropUnlikely(n, 2)
var e []string
size := 0
for {
k := pickTopStringParallel(n)
if k == "" || len(k)+size > maxSize {
break
}
removeString(n, k)
e = append(e, k)
size += len(k)
fmt.Println(k)
}
return e
}
func main() {
/*
f, err := ioutil.ReadFile("citylots.json")
if err != nil {
panic(err)
}
s := strings.Split((string)(f), "\n")
*/
f, err := ioutil.ReadFile("master_get_all.c.json")
if err != nil {
panic(err)
}
s := []string{(string)(f)}
d := BuildDictionary(s, 4096)
for i, k := range d {
fmt.Printf("%5d: %q\n", i, k)
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment