Skip to content

Instantly share code, notes, and snippets.

@jbowles jbowles/easy_token.go
Last active Dec 26, 2015

Embed
What would you like to do?
Easy, out-of-the-box natural language tokenizer using the Go standard library
package main
import (
	"fmt"
	"sort"
	"strings"
	"unicode"
)
// wordBreak is the marker stored in the "letter" category wherever the input
// contains whitespace. Every other "letter" entry is a single letter rune, so
// the marker cannot collide with real content when the entries are joined and
// split back into words.
const wordBreak = ", "

// main runs the rune classifier over a sample sentence, prints each
// reconstructed word on its own line, and then dumps every category.
func main() {
	str := "th0.7is7! Then I want to show Snow White and the Seven Dwarves. <=AndThe start of a new sentence. And\n then\n\nagain for One and NASA?"
	cont := classify(str)

	for _, w := range cont["words"] {
		fmt.Println(w)
	}

	// Map iteration order is randomized in Go; sort the keys so that the
	// category dump is identical from run to run.
	keys := make([]string, 0, len(cont))
	for k := range cont {
		keys = append(keys, k)
	}
	sort.Strings(keys)
	for _, k := range keys {
		fmt.Printf("Key: %v\t Value: %v\n", k, cont[k])
	}
}

// classify walks text rune by rune, prints the category of each rune, and
// returns the runes grouped by category under the keys "title", "letter",
// "number", "punct" and "symbol", plus a "words" entry.
//
// Words are rebuilt from the letters only: digits, punctuation and symbols
// are dropped from inside a word, and whitespace ends the current word. Runs
// of whitespace (and leading or trailing whitespace) do not produce empty
// words. Runes that match none of the tested categories are ignored.
func classify(text string) map[string][]string {
	cont := make(map[string][]string)
	for _, r := range text {
		s := string(r)
		switch {
		case unicode.IsTitle(r):
			fmt.Println("Title:", s)
			cont["title"] = append(cont["title"], s)
			// Titlecase runes are letters as well; record them under
			// "letter" too so they are not lost from the rebuilt words.
			cont["letter"] = append(cont["letter"], s)
		case unicode.IsLetter(r):
			fmt.Println("Letter:", s)
			cont["letter"] = append(cont["letter"], s)
		case unicode.IsNumber(r):
			fmt.Println("Number:", s)
			cont["number"] = append(cont["number"], s)
		case unicode.IsPunct(r):
			fmt.Println("Punct:", s)
			cont["punct"] = append(cont["punct"], s)
		case unicode.IsSpace(r):
			fmt.Println("Space:", s)
			// Whitespace is not kept as a category of its own; it only
			// marks a word boundary inside the letter stream.
			cont["letter"] = append(cont["letter"], wordBreak)
		case unicode.IsSymbol(r):
			fmt.Println("Symbol:", s)
			cont["symbol"] = append(cont["symbol"], s)
		}
	}

	// Consecutive boundaries yield empty pieces when splitting; skip them so
	// "words" only ever contains real words.
	words := []string{}
	for _, w := range strings.Split(strings.Join(cont["letter"], ""), wordBreak) {
		if w != "" {
			words = append(words, w)
		}
	}
	cont["words"] = words
	return cont
}
/*
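**** OUTPUT (note: this run shows "for One on One and NASA", so it was captured with a slightly different input than the str in main)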
Letter: t
Letter: h
Number: 0
Punct: .
Number: 7
Letter: i
Letter: s
Number: 7
Punct: !
Space:
Letter: T
Letter: h
Letter: e
Letter: n
Space:
Letter: I
Space:
Letter: w
Letter: a
Letter: n
Letter: t
Space:
Letter: t
Letter: o
Space:
Letter: s
Letter: h
Letter: o
Letter: w
Space:
Letter: S
Letter: n
Letter: o
Letter: w
Space:
Letter: W
Letter: h
Letter: i
Letter: t
Letter: e
Space:
Letter: a
Letter: n
Letter: d
Space:
Letter: t
Letter: h
Letter: e
Space:
Letter: S
Letter: e
Letter: v
Letter: e
Letter: n
Space:
Letter: D
Letter: w
Letter: a
Letter: r
Letter: v
Letter: e
Letter: s
Punct: .
Space:
Symbol: <
Symbol: =
Letter: A
Letter: n
Letter: d
Letter: T
Letter: h
Letter: e
Space:
Letter: s
Letter: t
Letter: a
Letter: r
Letter: t
Space:
Letter: o
Letter: f
Space:
Letter: a
Space:
Letter: n
Letter: e
Letter: w
Space:
Letter: s
Letter: e
Letter: n
Letter: t
Letter: e
Letter: n
Letter: c
Letter: e
Punct: .
Space:
Letter: A
Letter: n
Letter: d
Space:
Space:
Letter: t
Letter: h
Letter: e
Letter: n
Space:
Space:
Letter: a
Letter: g
Letter: a
Letter: i
Letter: n
Space:
Letter: f
Letter: o
Letter: r
Space:
Letter: O
Letter: n
Letter: e
Space:
Letter: o
Letter: n
Space:
Letter: O
Letter: n
Letter: e
Space:
Letter: a
Letter: n
Letter: d
Space:
Letter: N
Letter: A
Letter: S
Letter: A
Punct: ?
this
Then
I
want
to
show
Snow
White
and
the
Seven
Dwarves
AndThe
start
of
a
new
sentence
And
then
again
for
One
on
One
and
NASA
Key: letter Value: [t h i s , T h e n , I , w a n t , t o , s h o w , S n o w , W h i t e , a n d , t h e , S e v e n , D w a r v e s , A n d T h e , s t a r t , o f , a , n e w , s e n t e n c e , A n d , , t h e n , , a g a i n , f o r , O n e , o n , O n e , a n d , N A S A]
Key: number Value: [0 7 7]
Key: punct Value: [. ! . . ?]
Key: symbol Value: [< =]
Key: words Value: [this Then I want to show Snow White and the Seven Dwarves AndThe start of a new sentence And then again for One on One and NASA]
*/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.