Skip to content

Instantly share code, notes, and snippets.

@toVersus
Created April 27, 2018 12:49
Show Gist options
  • Save toVersus/0b999dbf31a9d64226db9d9390ff7d86 to your computer and use it in GitHub Desktop.
Save toVersus/0b999dbf31a9d64226db9d9390ff7d86 to your computer and use it in GitHub Desktop.
[Language Processing 100 Essentials] #72: Extract origins by removing the elements that match stop words
package main
import (
"bufio"
"flag"
"fmt"
"os"
"strings"
porterstemmer "github.com/reiver/go-porterstemmer"
)
// StopWords lists the lowercase English words (pronouns, auxiliary verbs,
// articles and a few conjunctions) that isStopWord compares against.
// Entries are matched by exact string equality, so matching is case-sensitive.
var StopWords = []string{"i", "me", "my", "myself", "we", "our", "ours", "ourselves",
"you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself",
"she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves",
"what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were",
"be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing",
"a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while"}
// review is one parsed input line: a sentiment label plus the words that
// were kept from the remainder of the line.
type review struct {
// origins holds the words of the line that were neither stop words nor
// meta characters; getStem overwrites these entries in place.
origins []string
// sentiment is the first whitespace-separated field of the line
// ("+1" or "-1" in the test data).
sentiment string
}
// newReviews reads the file at path line by line and builds one review per
// non-blank line.
//
// Each line is split on whitespace: the first field becomes the sentiment
// label and every following field that is neither a stop word nor a meta
// character is appended to origins, in order.
//
// The signature has no error result, so failures are reported on stderr:
//   - if the file cannot be opened, an empty (non-nil) reviews is returned;
//   - if scanning stops with an error, the reviews parsed so far are returned.
//
// Blank or whitespace-only lines are skipped; they have no first field to
// use as a sentiment label.
func newReviews(path string) reviews {
	f, err := os.Open(path)
	if err != nil {
		fmt.Fprintf(os.Stderr, "could not open a file: %s\n  %v\n", path, err)
		return reviews{}
	}
	defer f.Close()

	result := reviews{}
	sc := bufio.NewScanner(f)
	for sc.Scan() {
		words := strings.Fields(sc.Text())
		if len(words) == 0 {
			// strings.Fields yields an empty slice for a blank line;
			// indexing words[0] would panic.
			continue
		}

		r := review{sentiment: words[0]}
		for _, word := range words[1:] {
			if isStopWord(word) || isMetaCharacter(word) {
				continue
			}
			r.origins = append(r.origins, word)
		}
		result = append(result, r)
	}
	// Scan returns false both at EOF and on failure; Err tells them apart.
	if err := sc.Err(); err != nil {
		fmt.Fprintf(os.Stderr, "could not read a file: %s\n  %v\n", path, err)
	}
	return result
}
// reviews is a slice of review values; newReviews appends one element per
// parsed input line, and getStem is defined on a pointer to it.
type reviews []review
// getStem replaces every entry of every review's origins with the value
// porterstemmer.StemString returns for it. The replacement happens in place,
// so the receiver's underlying data is modified; the returned value is the
// dereferenced receiver (the same slice), not a copy.
func (rs *reviews) getStem() reviews {
	all := *rs
	for i := range all {
		words := all[i].origins
		for j := range words {
			words[j] = porterstemmer.StemString(words[j])
		}
	}
	return all
}
// main reads the input path from the -file (or -f) flag, exits with status 1
// and a message on stderr when os.Stat fails for that path, and otherwise
// loads the reviews, stems them and prints the result in Go syntax (%#v).
func main() {
	const usage = "specify a file path"

	var path string
	// Both flag names write to the same variable, so -f is a short alias.
	flag.StringVar(&path, "file", "", usage)
	flag.StringVar(&path, "f", "", usage)
	flag.Parse()

	_, statErr := os.Stat(path)
	if statErr != nil {
		fmt.Fprintf(os.Stderr, "could not find a file: %s\n %#v", path, statErr)
		os.Exit(1)
	}

	loaded := newReviews(path)
	fmt.Printf("%#v\n", loaded.getStem())
}
// isStopWord reports whether word is exactly equal to one of the entries of
// StopWords. The comparison is case-sensitive.
//
// The list is scanned directly rather than copied into a freshly allocated
// map on every call: building the map already visits every entry, so the
// scan does strictly less work and allocates nothing, while still reflecting
// the current contents of StopWords.
func isStopWord(word string) bool {
	for _, w := range StopWords {
		if w == word {
			return true
		}
	}
	return false
}
// MetaCharacters lists the punctuation tokens that isMetaCharacter compares
// against. A token matches only when the whole token equals an entry
// (for example "--" or "."), not when it merely contains one.
var MetaCharacters = []string{
".", ",", ":", ";", "(", ")", "!", "?", "/", "\\",
"{", "}", "-", "_", "[", "]", "~", "'", "--", "\"",
}
// isMetaCharacter reports whether word is exactly equal to one of the
// entries of MetaCharacters. A word that only contains punctuation as part
// of a longer token (such as "it's") does not match.
//
// The list is scanned directly rather than copied into a freshly allocated
// map on every call: building the map already visits every entry, so the
// scan does strictly less work and allocates nothing.
func isMetaCharacter(word string) bool {
	for _, meta := range MetaCharacters {
		if meta == word {
			return true
		}
	}
	return false
}
package main
import (
"bufio"
"os"
"reflect"
"strings"
"testing"
)
// extractOriginTests is the table driving TestExtractOrigins. Each case
// supplies raw review text, the file name it is written to, and the reviews
// expected after stop-word/meta-character filtering and stemming.
var extractOriginTests = []struct {
// name is the human-readable label of the case.
name string
// file is the path the test writes text to before reading it back.
file string
// text is the raw input: one review per line, sentiment label first.
text string
// want is the expected result of filtering and then stemming text.
want reviews
}{
{
name: "should extract formatted words",
file: "full-test.txt",
text: `+1 the bard as black comedy -- willie would have loved it .
-1 it's a frankenstein-monster of a film that doesn't know what it wants to be .
`,
want: reviews{
review{origins: []string{"bard", "black", "comedi", "willi", "would", "love"}, sentiment: "+1"},
review{origins: []string{"it'", "frankenstein-monst", "of", "film", "doesn't", "know", "want", "to"}, sentiment: "-1"},
},
},
}
// TestExtractOrigins runs every case of extractOriginTests as a subtest.
// For each case it writes the input text to the case's file, reads it back
// line by line, drops stop words and meta characters, stems the remaining
// words with getStem and compares the result with the expected reviews.
//
// Setup failures abort the subtest immediately (Fatalf) instead of carrying
// on with a nil file handle, and the fixture file is removed afterwards so
// the test leaves nothing behind in the working directory.
func TestExtractOrigins(t *testing.T) {
	for _, testcase := range extractOriginTests {
		// The subtest closure runs synchronously within this iteration, so
		// capturing the loop variable is safe and the defers fire per case.
		t.Run(testcase.name, func(t *testing.T) {
			f, err := os.Create(testcase.file)
			if err != nil {
				t.Fatalf("could not create a file: %s\n  %s", testcase.file, err)
			}
			defer os.Remove(testcase.file)

			if _, err := f.WriteString(testcase.text); err != nil {
				f.Close()
				t.Fatalf("could not write a file: %s\n  %s", testcase.file, err)
			}
			if err := f.Close(); err != nil {
				t.Fatalf("could not close a file: %s\n  %s", testcase.file, err)
			}

			f, err = os.Open(testcase.file)
			if err != nil {
				t.Fatalf("could not open a file: %s\n  %s", testcase.file, err)
			}
			// Registered after the Remove above, so the handle is closed
			// before the file is deleted.
			defer f.Close()

			got := reviews{}
			sc := bufio.NewScanner(f)
			for sc.Scan() {
				words := strings.Fields(sc.Text())
				if len(words) == 0 {
					continue
				}
				r := review{sentiment: words[0]}
				for _, word := range words[1:] {
					if isStopWord(word) || isMetaCharacter(word) {
						continue
					}
					r.origins = append(r.origins, word)
				}
				got = append(got, r)
			}
			if err := sc.Err(); err != nil {
				t.Fatalf("could not read a file: %s\n  %s", testcase.file, err)
			}

			result := got.getStem()
			if !reflect.DeepEqual(result, testcase.want) {
				t.Errorf("got  => %#v\nwant => %#v", result, testcase.want)
			}
		})
	}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment