Last active
January 28, 2020 09:04
-
-
Save CAFxX/f945d3be0d99863d47d884feb0fa56e9 to your computer and use it in GitHub Desktop.
dict.go
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"fmt" | |
//"strings" | |
"io/ioutil" | |
"sync" | |
"runtime" | |
) | |
const ( | |
minLen = 4 | |
maxLen = 48 | |
) | |
func toShards(m map[string]int, s string) map[string]int { | |
if m == nil { | |
m = make(map[string]int, (maxLen-minLen)*len(s)) | |
} | |
_toShards(m, s) | |
return m | |
} | |
func _toShards(m map[string]int, s string) { | |
for i := 0; i < len(s); i++ { | |
for j := minLen; j <= maxLen && i+j<=len(s); j++ { | |
// TODO: handle overlapping strings | |
// TODO: do not consider repetitions that happen in the same window | |
m[s[i:i+j]]++ | |
} | |
} | |
} | |
func pickTopString(m map[string]int) string { | |
var topString string | |
var topScore int | |
for k := range m { | |
score := evalScore(m, k) | |
if score > topScore { | |
topString, topScore = k, score | |
} | |
} | |
return topString | |
} | |
func pickTopStringParallel(m map[string]int) string { | |
c := make([]string, 0, len(m)) | |
for k := range m { | |
c = append(c, k) | |
} | |
var topString string | |
var topScore int | |
p := runtime.GOMAXPROCS(0) | |
var chunkSize = (len(c) + p - 1) / p | |
var mutex sync.Mutex | |
var wg sync.WaitGroup | |
for i := 0; i < p; i++ { | |
start, end := i*chunkSize, (i+1)*chunkSize | |
if end > len(c) { | |
end = len(c) | |
} | |
wg.Add(1) | |
go func() { | |
defer wg.Done() | |
var locString string | |
var locScore int | |
for _, k := range c[start:end] { | |
score := evalScore(m, k) | |
if score > locScore { | |
locString, locScore = k, score | |
} | |
} | |
mutex.Lock() | |
defer mutex.Unlock() | |
if locScore > topScore { | |
topString, topScore = locString, locScore | |
} | |
}() | |
} | |
wg.Wait() | |
return topString | |
} | |
func evalScore(m map[string]int, k string) int { | |
const refSize = 3 | |
ck := m[k] | |
score := ck * (len(k) - refSize) | |
for i := 0; i < len(k); i++ { | |
for j := minLen; j <= maxLen && i+j<=len(k); j++ { | |
s := k[i:i+j] | |
if cs := m[s]; cs > ck { | |
score += (cs-ck) * (len(s) - refSize) | |
} | |
} | |
} | |
return score / len(k) | |
} | |
func removeString(m map[string]int, s string) { | |
for i := 0; i < len(s); i++ { | |
for j := minLen; j <= maxLen && i+j<=len(s); j++ { | |
delete(m, s[i:i+j]) | |
} | |
} | |
} | |
func dropUnlikely(n map[string]int, thr int) { | |
for k, v := range n { | |
if v < thr { | |
delete(n, k) | |
} | |
} | |
} | |
func BuildDictionary(s []string, maxSize int) []string { | |
n := make(map[string]int) | |
for i, k := range s { | |
if i % 1000 == 0 { | |
dropUnlikely(n, i/1000) | |
fmt.Printf("> %d/%d %d \r", i, len(s), len(n)) | |
} | |
n = toShards(n, k) | |
} | |
dropUnlikely(n, len(s)/1000) | |
dropUnlikely(n, 2) | |
var e []string | |
size := 0 | |
for { | |
k := pickTopStringParallel(n) | |
if k == "" || len(k)+size > maxSize { | |
break | |
} | |
removeString(n, k) | |
e = append(e, k) | |
size += len(k) | |
fmt.Println(k) | |
} | |
return e | |
} | |
func main() { | |
/* | |
f, err := ioutil.ReadFile("citylots.json") | |
if err != nil { | |
panic(err) | |
} | |
s := strings.Split((string)(f), "\n") | |
*/ | |
f, err := ioutil.ReadFile("master_get_all.c.json") | |
if err != nil { | |
panic(err) | |
} | |
s := []string{(string)(f)} | |
d := BuildDictionary(s, 4096) | |
for i, k := range d { | |
fmt.Printf("%5d: %q\n", i, k) | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment