Instantly share code, notes, and snippets.

@sheki /gen.go
Last active Nov 13, 2015

Embed
What would you like to do?
markov_chain_gen.go
package main
import (
"bufio"
"bytes"
"fmt"
"io/ioutil"
"math/rand"
"os"
"path"
"strings"
"github.com/e-dard/headlines"
)
// corpus is the directory that main scans for input text; every
// non-directory entry directly inside it is read as training material.
const corpus = "./corpus/"
// main reads every non-directory entry in the corpus directory, turns each
// file into cleaned, period-delimited sentences, feeds all of them (one
// sentence per line) to a headlines chain, and prints ten generated lines.
// Any I/O or chain-building error aborts the program via perr.
func main() {
	files, err := ioutil.ReadDir(corpus)
	perr(err)

	buf := &bytes.Buffer{}
	for _, f := range files {
		if f.IsDir() {
			continue
		}
		w := readWords(path.Join(corpus, f.Name()))
		w = cleanWords(w)
		sentences := wordsToSentence(w)
		if len(sentences) == 0 {
			continue
		}
		// Separate this file's sentences from the previous file's; without
		// the newline the last sentence of one file and the first sentence
		// of the next would be fused into a single line.
		if buf.Len() > 0 {
			buf.WriteByte('\n')
		}
		buf.WriteString(strings.Join(sentences, "\n"))
	}

	// NOTE(review): the meaning of the 0 argument is defined by the
	// headlines package — confirm against its documentation.
	chain := headlines.NewChain(0)
	perr(chain.Build(buf))

	for i := 0; i < 10; i++ {
		fmt.Println(chain.MustGenerate(pickSentenceSize()))
	}
}
// pickSentenceSize returns a pseudo-random size in the half-open range
// [10, 20), drawn from the package-level math/rand source.
func pickSentenceSize() int {
	const (
		base   = 10 // smallest size ever returned
		spread = 10 // number of distinct sizes above and including base
	)
	extra := rand.Intn(spread)
	return base + extra
}
// wordsToSentence groups words into sentences, each returned as a single
// space-joined string.
//
// A word ending in "." closes the current sentence; that one trailing period
// is dropped from the word. Words remaining after the last period form a
// final sentence. Words that are empty once the period is removed (such as a
// lone ".") contribute no text, and empty sentences are never emitted, so the
// result contains no blank entries or stray spaces.
func wordsToSentence(words []string) []string {
	var sentences []string
	var current []string

	for _, w := range words {
		endsSentence := strings.HasSuffix(w, ".")
		if endsSentence {
			w = strings.TrimSuffix(w, ".")
		}
		// Skip empty tokens so they do not introduce double or trailing
		// spaces when the sentence is joined.
		if w != "" {
			current = append(current, w)
		}
		if endsSentence && len(current) > 0 {
			sentences = append(sentences, strings.Join(current, " "))
			current = nil
		}
	}

	// Flush words that were not terminated by a period.
	if len(current) != 0 {
		sentences = append(sentences, strings.Join(current, " "))
	}
	return sentences
}
// suffixes and prefixes list the punctuation tokens that cleanWords strips
// from the end and the start of each word, respectively. Each token is tried
// once, in slice order.
var (
	suffixes = []string{",", "!", ")", "\"", "-", "*"}
	prefixes = []string{"(", "\"", "-", "*"}
)

// cleanWords normalizes raw words: it trims surrounding whitespace, strips
// one occurrence of each entry in suffixes from the end and of each entry in
// prefixes from the start (in slice order), lower-cases what remains, and
// drops any word that ends up empty. The input slice is not modified.
func cleanWords(in []string) []string {
	var res []string
	for _, w := range in {
		s := strings.TrimSpace(w)
		if s == "" {
			continue
		}
		for _, suff := range suffixes {
			s = strings.TrimSuffix(s, suff)
		}
		for _, pref := range prefixes {
			s = strings.TrimPrefix(s, pref)
		}
		// Re-trim the stripped word (not the original w): trimming w here
		// would throw away all of the punctuation stripping done above.
		s = strings.TrimSpace(s)
		if s == "" {
			continue
		}
		res = append(res, strings.ToLower(s))
	}
	return res
}
// readWords opens the named file and returns its contents split into
// whitespace-separated words (bufio.ScanWords). Failure to open the file, or
// a scanner error while reading it, aborts the program via perr.
func readWords(file string) []string {
	f, err := os.Open(file)
	// Check the error before deferring Close so Close is only ever
	// scheduled on a successfully opened file.
	perr(err)
	defer f.Close()

	scanner := bufio.NewScanner(f)
	scanner.Split(bufio.ScanWords)

	var words []string
	for scanner.Scan() {
		words = append(words, scanner.Text())
	}
	perr(scanner.Err())
	return words
}
// perr panics with err when err is non-nil and is a no-op otherwise.
func perr(err error) {
	if err == nil {
		return
	}
	panic(err)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment