// markov_chain_gen.go

package main

import (
	"bufio"
	"bytes"
	"fmt"
	"io/ioutil"
	"math/rand"
	"os"
	"path"
	"strings"

	"github.com/e-dard/headlines"
)
// corpus is the directory, relative to the working directory, whose regular
// files main reads as training text.
const corpus = "./corpus/"
func main() { | |
files, err := ioutil.ReadDir(corpus) | |
perr(err) | |
var buf = &bytes.Buffer{} | |
for _, f := range files { | |
if f.IsDir() { | |
continue | |
} | |
w := readWords(path.Join(corpus, f.Name())) | |
w = cleanWords(w) | |
sentences := wordsToSentence(w) | |
buf.WriteString(strings.Join(sentences, "\n")) | |
} | |
chain := headlines.NewChain(0) | |
perr(chain.Build(buf)) | |
for i := 0; i < 10; i++ { | |
fmt.Println(chain.MustGenerate(pickSentenceSize())) | |
} | |
} | |
// pickSentenceSize returns a pseudo-random size in the half-open range
// [10, 20), drawn from the math/rand package-level source.
func pickSentenceSize() int {
	const (
		minSize = 10 // smallest size returned
		spread  = 10 // number of distinct sizes that can be returned
	)
	return minSize + rand.Intn(spread)
}
// wordsToSentence groups words into sentences. A word ending in "." closes
// the current sentence (one trailing "." is removed from that word); the
// words of a sentence are joined with single spaces. Words left over after
// the last "." form a final sentence.
//
// Empty words, including the one left behind by a bare "." token, are
// skipped, and a sentence with no words is not emitted, so the result never
// contains empty strings or sentences with stray spaces.
func wordsToSentence(words []string) []string {
	var sentences []string
	var current []string

	// flush emits the words collected so far as one sentence, if any.
	flush := func() {
		if len(current) == 0 {
			return
		}
		sentences = append(sentences, strings.Join(current, " "))
		current = nil
	}

	for _, w := range words {
		endsSentence := strings.HasSuffix(w, ".")
		if endsSentence {
			w = strings.TrimSuffix(w, ".")
		}
		if w != "" {
			current = append(current, w)
		}
		if endsSentence {
			flush()
		}
	}
	flush()
	return sentences
}
var (
	// suffixes lists the trailing punctuation marks cleanWords removes.
	suffixes = []string{",", "!", ")", "\"", "-", "*"}
	// prefixes lists the leading punctuation marks cleanWords removes.
	prefixes = []string{"(", "\"", "-", "*"}
)

// cleanWords normalizes each word of in: surrounding whitespace is trimmed,
// one occurrence of each entry in suffixes and prefixes is stripped, and the
// result is lower-cased. Words that are empty before or after stripping are
// dropped. The returned slice is nil when no word survives.
//
// Each affix is tried once, in list order, so a word such as `done!"` loses
// only the closing quote.
func cleanWords(in []string) []string {
	var res []string
	for _, w := range in {
		s := strings.TrimSpace(w)
		if s == "" {
			continue
		}
		for _, suff := range suffixes {
			s = strings.TrimSuffix(s, suff)
		}
		for _, pref := range prefixes {
			s = strings.TrimPrefix(s, pref)
		}
		// Re-trim the stripped word s, not the original w: trimming w here
		// would throw away the affix removal done above.
		s = strings.TrimSpace(s)
		if s == "" {
			continue
		}
		res = append(res, strings.ToLower(s))
	}
	return res
}
func readWords(file string) []string { | |
f, err := os.Open(file) | |
defer f.Close() | |
perr(err) | |
scanner := bufio.NewScanner(f) | |
scanner.Split(bufio.ScanWords) | |
var words []string | |
for scanner.Scan() { | |
words = append(words, scanner.Text()) | |
} | |
perr(scanner.Err()) | |
return words | |
} | |
// perr panics with err when err is non-nil and does nothing otherwise.
func perr(err error) {
	if err != nil {
		panic(err)
	}
}
// Sign up for free to join this conversation on GitHub.
// Already have an account? Sign in to comment