Last active
December 17, 2015 20:29
-
-
Save BenLubar/5668009 to your computer and use it in GitHub Desktop.
couchbase markov chain generator
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"bufio" | |
"crypto/sha1" | |
"encoding/hex" | |
"flag" | |
"log" | |
"os" | |
"time" | |
"github.com/couchbaselabs/go-couchbase" | |
) | |
// Bucket is the Couchbase bucket handle. main assigns it from
// couchbase.GetBucket and uses it to install the design document and to
// store the text documents.
var Bucket *couchbase.Bucket
func main() { | |
flag.Parse() | |
var err error | |
Bucket, err = couchbase.GetBucket("http://127.0.0.1:18091/", "default", "markov") | |
if err != nil { | |
log.Fatalf("getting bucket: %v", err) | |
} | |
defer Bucket.Close() | |
Bucket.PutDDoc("markov", couchbase.DDocJSON{ | |
Views: map[string]couchbase.ViewDefinition{ | |
"markov": { | |
Map: `function( doc, meta ) { | |
if ( doc.Type != 'text' ) | |
return; | |
var words = ['', '', '', '', '']; | |
var text = doc.Text.split( /\s+/g ); | |
for ( var i = 0; i < text.length; i++ ) { | |
var word = text[i]; | |
emit( words, word ); | |
words.shift(); | |
words.push( word ); | |
} | |
emit( words, '' ); | |
}`, | |
Reduce: `function( k, v, rereduce ) { | |
var counts = {}; | |
if ( rereduce ) | |
for ( var i = 0; i < v.length; i++ ) | |
for ( var j = 0; j < v[i].length; j++ ) { | |
counts[v[i][j].w] = counts[v[i][j].w] || 0; | |
counts[v[i][j].w] += v[i][j].c; | |
} | |
else | |
for ( var i = 0; i < v.length; i++ ) { | |
counts[v[i]] = counts[v[i]] || 0; | |
counts[v[i]]++; | |
} | |
var result = []; | |
for ( var w in counts ) { | |
result.push( {w: w, c: counts[w]} ); | |
} | |
return result.sort( function( a, b ) { | |
return b.c - a.c; | |
} ); | |
}`, | |
}, | |
}, | |
}) | |
sha := sha1.New() | |
for _, fn := range flag.Args() { | |
f, err := os.Open(fn) | |
if err != nil { | |
log.Fatalf("opening %s: %v", fn, err) | |
} | |
s := bufio.NewScanner(f) | |
for s.Scan() { | |
sha.Reset() | |
sha.Write(s.Bytes()) | |
key := hex.EncodeToString(sha.Sum(nil)) | |
line := s.Text() | |
for { | |
err := Bucket.Set(key, 0, struct { | |
Type, Text string | |
}{ | |
Type: "text", | |
Text: line, | |
}) | |
if err != nil { | |
log.Printf("%s: %v", key, err) | |
time.Sleep(time.Second) | |
} else { | |
break | |
} | |
} | |
} | |
if err = s.Err(); err != nil { | |
log.Fatalf("reading %s: %v", fn, err) | |
} | |
f.Close() | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import (
	"fmt"
	"math/rand"
	"sort"
	"strings"
	"sync"
)
// PrefixLength is the number of preceding words that make up a chain
// prefix; it fixes the size of MapResult.Prefix and of the keys passed to
// Reduce and ChainLink.
const PrefixLength = 2
// Doc is a single input document. Mapper splits Text into
// whitespace-separated words.
type Doc struct {
	Text string
}
// MapResult is one emission of the map phase: Suffix is a word that was seen
// after the words in Prefix. Prefix is ordered most recent word first, with
// empty strings for positions before the start of the document. Mapper emits
// an empty Suffix to mark the end of a document.
type MapResult struct {
	Prefix [PrefixLength]string
	Suffix string
}
// ReduceResult is the aggregated outcome of the reduce phase: Suffix maps
// each suffix to the number of times it was counted, and Total is the sum of
// those counts.
type ReduceResult struct {
	Total  int
	Suffix map[string]int
}
func Input(docs chan<- Doc) { | |
defer close(docs) | |
lines := []string{ | |
"I am not a number!", | |
"I am a free man!", | |
} | |
for _, line := range lines { | |
docs <- Doc{Text: line} | |
} | |
} | |
func Map(docs <-chan Doc, emit chan<- MapResult) { | |
defer close(emit) | |
var wg sync.WaitGroup | |
const workers = 4 | |
wg.Add(workers) | |
for i := 0; i < workers; i++ { | |
go Mapper(docs, emit, &wg) | |
} | |
wg.Wait() | |
} | |
func Mapper(docs <-chan Doc, emit chan<- MapResult, wg *sync.WaitGroup) { | |
defer wg.Done() | |
for doc := range docs { | |
var result MapResult | |
for _, word := range strings.Fields(doc.Text) { | |
result.Suffix = word | |
emit <- result | |
copy(result.Prefix[1:], result.Prefix[:]) | |
result.Prefix[0] = word | |
} | |
result.Suffix = "" | |
emit <- result | |
} | |
} | |
func Reduce(emit <-chan MapResult, key [PrefixLength]string) ReduceResult { | |
output := make(chan ReduceResult) | |
var ( | |
inputs []MapResult | |
outputs []ReduceResult | |
waiting int | |
) | |
reduce: | |
for { | |
select { | |
case in, ok := <-emit: | |
if !ok { | |
if len(inputs) > 0 { | |
waiting++ | |
go Reducer(inputs, output) | |
inputs = nil | |
} | |
break reduce | |
} | |
if in.Prefix != key { | |
continue | |
} | |
inputs = append(inputs, in) | |
if len(inputs) >= 10 { | |
waiting++ | |
go Reducer(inputs, output) | |
inputs = nil | |
} | |
case out := <-output: | |
waiting-- | |
outputs = append(outputs, out) | |
if len(outputs) >= 10 { | |
waiting++ | |
go Rereducer(outputs, output) | |
outputs = nil | |
} | |
} | |
} | |
for waiting > 0 { | |
out := <-output | |
waiting-- | |
outputs = append(outputs, out) | |
if len(outputs) >= 10 { | |
waiting++ | |
go Rereducer(outputs, output) | |
outputs = nil | |
} | |
} | |
go Rereducer(outputs, output) | |
return <-output | |
} | |
func Reducer(emit []MapResult, output chan<- ReduceResult) { | |
var result ReduceResult | |
result.Suffix = make(map[string]int) | |
for _, r := range emit { | |
result.Total++ | |
result.Suffix[r.Suffix]++ | |
} | |
output <- result | |
} | |
func Rereducer(emit []ReduceResult, output chan<- ReduceResult) { | |
var result ReduceResult | |
result.Suffix = make(map[string]int) | |
for _, r := range emit { | |
result.Total += r.Total | |
for k, v := range r.Suffix { | |
result.Suffix[k] += v | |
} | |
} | |
output <- result | |
} | |
func ChainLink(r *rand.Rand, prefix [PrefixLength]string) string { | |
docs := make(chan Doc) | |
emit := make(chan MapResult) | |
go Input(docs) | |
go Map(docs, emit) | |
possible := Reduce(emit, prefix) | |
n := r.Intn(possible.Total) | |
for p, c := range possible.Suffix { | |
n -= c | |
if n < 0 { | |
return p | |
} | |
} | |
return "" | |
} | |
func Chain(r *rand.Rand) string { | |
var prefix [PrefixLength]string | |
var words []string | |
for { | |
word := ChainLink(r, prefix) | |
if word == "" { | |
return strings.Join(words, " ") | |
} | |
words = append(words, word) | |
copy(prefix[1:], prefix[:]) | |
prefix[0] = word | |
} | |
} | |
func main() { | |
r := rand.New(rand.NewSource(0)) | |
for i := 0; i < 10; i++ { | |
fmt.Println(Chain(r)) | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment